/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"

/****************************************************************************************\
                                    Base Image Filter
\****************************************************************************************/

namespace cv {

// Row filters start out unconfigured; concrete subclasses set ksize/anchor
// in their own constructors.
BaseRowFilter::BaseRowFilter() {
	ksize = -1;
	anchor = -1;
}

BaseRowFilter::~BaseRowFilter() {}

// Column filters start out unconfigured (invalid ksize/anchor). reset() is a
// no-op by default; stateful implementations override it to clear their
// accumulated row state between runs.
BaseColumnFilter::BaseColumnFilter() {
	ksize = -1;
	anchor = -1;
}

BaseColumnFilter::~BaseColumnFilter() {}

void BaseColumnFilter::reset() {}

// Non-separable 2D filters: kernel size and anchor are invalid until a
// concrete implementation configures them. reset() is a no-op by default.
BaseFilter::BaseFilter() {
	ksize = Size(-1, -1);
	anchor = Point(-1, -1);
}

BaseFilter::~BaseFilter() {}

void BaseFilter::reset() {}

/*
 Various border types, image boundaries are denoted with '|'

    * BORDER_REPLICATE:     aaaaaa|abcdefgh|hhhhhhh
    * BORDER_REFLECT:       fedcba|abcdefgh|hgfedcb
    * BORDER_REFLECT_101:   gfedcb|abcdefgh|gfedcba
    * BORDER_WRAP:          cdefgh|abcdefgh|abcdefg
    * BORDER_CONSTANT:      iiiiii|abcdefgh|iiiiiii  with some specified 'i'
*/
// Maps a (possibly out-of-range) coordinate p onto [0, len) according to the
// given extrapolation mode (see the table above). Returns -1 for
// BORDER_CONSTANT, meaning "use the constant border value instead of a pixel".
int borderInterpolate( int p, int len, int borderType ) {
	// fast path: coordinate already inside the image
	if ( (unsigned)p < (unsigned)len ) {
		return p;
	}

	switch ( borderType ) {
	case BORDER_REPLICATE:
		p = p < 0 ? 0 : len - 1;
		break;
	case BORDER_REFLECT:
	case BORDER_REFLECT_101: {
		// REFLECT_101 mirrors without repeating the edge pixel (delta == 1)
		const int delta = (borderType == BORDER_REFLECT_101);
		if ( len == 1 ) {
			return 0;
		}
		// keep folding until the coordinate lands inside the image; needed
		// when |p| exceeds the image width
		do {
			p = p < 0 ? -p - 1 + delta : len - 1 - (p - len) - delta;
		} while ( (unsigned)p >= (unsigned)len );
		break;
	}
	case BORDER_WRAP:
		if ( p < 0 ) {
			p -= ((p - len + 1) / len) * len;
		}
		if ( p >= len ) {
			p %= len;
		}
		break;
	case BORDER_CONSTANT:
		p = -1;
		break;
	default:
		CV_Error( CV_StsBadArg, "Unknown/unsupported border type" );
	}
	return p;
}


// Default constructor: leaves the engine in a "not initialized / not started"
// state (negative type codes, wholeSize == (-1,-1)); init() and start() must
// be called before proceed()/apply().
FilterEngine::FilterEngine() {
	srcType = -1;
	dstType = -1;
	bufType = -1;
	rowBorderType = BORDER_REPLICATE;
	columnBorderType = BORDER_REPLICATE;
	bufStep = 0;
	startY = 0;
	startY0 = 0;
	endY = 0;
	rowCount = 0;
	dstY = 0;
	maxWidth = 0;

	wholeSize = Size(-1, -1);
}


// Convenience constructor: fully configures the engine via init(), either
// with a non-separable 2D filter (_filter2D) or a separable pair
// (_rowFilter + _columnFilter). See init() for parameter semantics.
FilterEngine::FilterEngine( const Ptr<BaseFilter>& _filter2D,
							const Ptr<BaseRowFilter>& _rowFilter,
							const Ptr<BaseColumnFilter>& _columnFilter,
							int _srcType, int _dstType, int _bufType,
							int _rowBorderType, int _columnBorderType,
							const Scalar& _borderValue ) {
	init(_filter2D, _rowFilter, _columnFilter, _srcType, _dstType, _bufType,
		 _rowBorderType, _columnBorderType, _borderValue);
}

// All buffers are std::vector / Ptr members, so the defaulted cleanup suffices.
FilterEngine::~FilterEngine() {
}


// Configures the engine for either a separable (rowFilter + columnFilter) or
// a non-separable (filter2D) pipeline.
//
// srcType/bufType/dstType describe the source image, the intermediate buffer
// (output of the row filter / input of the column filter) and the destination.
// _columnBorderType < 0 means "same as the row border type".
// _borderValue is only used with BORDER_CONSTANT.
void FilterEngine::init( const Ptr<BaseFilter>& _filter2D,
						 const Ptr<BaseRowFilter>& _rowFilter,
						 const Ptr<BaseColumnFilter>& _columnFilter,
						 int _srcType, int _dstType, int _bufType,
						 int _rowBorderType, int _columnBorderType,
						 const Scalar& _borderValue ) {
	// keep only the depth+channels part of the type codes
	_srcType = CV_MAT_TYPE(_srcType);
	_bufType = CV_MAT_TYPE(_bufType);
	_dstType = CV_MAT_TYPE(_dstType);

	srcType = _srcType;
	int srcElemSize = (int)getElemSize(srcType);
	dstType = _dstType;
	bufType = _bufType;

	filter2D = _filter2D;
	rowFilter = _rowFilter;
	columnFilter = _columnFilter;

	if ( _columnBorderType < 0 ) {
		_columnBorderType = _rowBorderType;
	}

	rowBorderType = _rowBorderType;
	columnBorderType = _columnBorderType;

	// vertical wrapping is not supported by the row-buffered processing scheme
	CV_Assert( columnBorderType != BORDER_WRAP );

	if ( isSeparable() ) {
		CV_Assert( !rowFilter.empty() && !columnFilter.empty() );
		ksize = Size(rowFilter->ksize, columnFilter->ksize);
		anchor = Point(rowFilter->anchor, columnFilter->anchor);
	} else {
		// a non-separable filter reads raw source pixels, so the intermediate
		// buffer must have the same type as the source
		CV_Assert( bufType == srcType );
		ksize = filter2D->ksize;
		anchor = filter2D->anchor;
	}

	CV_Assert( 0 <= anchor.x && anchor.x < ksize.width &&
			   0 <= anchor.y && anchor.y < ksize.height );

	// border table stores int indices per element for >= 32-bit depths, and
	// byte indices otherwise; ksize.width-1 is the max padded-column count
	borderElemSize = srcElemSize / (CV_MAT_DEPTH(srcType) >= CV_32S ? sizeof(int) : 1);
	borderTab.resize( std::max(ksize.width - 1, 1)*borderElemSize);

	maxWidth = bufStep = 0;
	constBorderRow.clear();

	if ( rowBorderType == BORDER_CONSTANT || columnBorderType == BORDER_CONSTANT ) {
		// pre-rasterize the border value so it can be copied into rows directly
		constBorderValue.resize(srcElemSize*(ksize.width - 1));
		scalarToRawData(_borderValue, &constBorderValue[0], srcType,
						(ksize.width - 1)*CV_MAT_CN(srcType));
	}

	// (-1,-1) marks the engine as "initialized but not started"; start() sets it
	wholeSize = Size(-1, -1);
}

// Alignment (bytes) for the ring buffer and constant-border row, so that the
// vectorized (SSE2) kernels below can use aligned stores.
static const int VEC_ALIGN = CV_MALLOC_ALIGN;

// Prepares the engine for filtering an image of size _wholeSize, of which the
// rectangle _roi is actually processed. (Re)allocates the row ring buffer and
// precomputes the horizontal border tables. Returns startY — the first source
// row that proceed() expects to be fed.
int FilterEngine::start(Size _wholeSize, Rect _roi, int _maxBufRows) {
	int i, j;

	wholeSize = _wholeSize;
	roi = _roi;
	CV_Assert( roi.x >= 0 && roi.y >= 0 && roi.width >= 0 && roi.height >= 0 &&
			   roi.x + roi.width <= wholeSize.width &&
			   roi.y + roi.height <= wholeSize.height );

	int esz = (int)getElemSize(srcType);
	int bufElemSize = (int)getElemSize(bufType);
	const uchar* constVal = !constBorderValue.empty() ? &constBorderValue[0] : 0;

	if ( _maxBufRows < 0 ) {
		_maxBufRows = ksize.height + 3;
	}
	// the ring buffer must hold at least one full vertical kernel span
	_maxBufRows = std::max(_maxBufRows, std::max(anchor.y, ksize.height - anchor.y - 1) * 2 + 1);

	// (re)allocate only when the ROI got wider or the row count changed
	if ( maxWidth < roi.width || _maxBufRows != (int)rows.size() ) {
		rows.resize(_maxBufRows);
		maxWidth = std::max(maxWidth, roi.width);
		int cn = CV_MAT_CN(srcType);
		srcRow.resize(esz*(maxWidth + ksize.width - 1));
		if ( columnBorderType == BORDER_CONSTANT ) {
			// build one ready-made "all border value" row; for separable
			// filters it must be passed through the row filter first
			constBorderRow.resize(getElemSize(bufType)*(maxWidth + VEC_ALIGN));
			uchar* dst = alignPtr(&constBorderRow[0], VEC_ALIGN), *tdst;
			int n = (int)constBorderValue.size(), N;
			if ( isSeparable() ) {
				tdst = &srcRow[0];
				N = (maxWidth + ksize.width - 1) * esz;
			} else {
				tdst = dst;
				N = maxWidth * esz;
			}

			// tile the rasterized border value over the whole row
			for ( i = 0; i < N; i += n ) {
				n = std::min( n, N - i );
				for (j = 0; j < n; j++) {
					tdst[i+j] = constVal[j];
				}
			}

			if ( isSeparable() ) {
				(*rowFilter)(&srcRow[0], dst, maxWidth, cn);
			}
		}

		int maxBufStep = bufElemSize * (int)alignSize(maxWidth +
						 (!isSeparable() ? ksize.width - 1 : 0), VEC_ALIGN);
		ringBuf.resize(maxBufStep * rows.size() + VEC_ALIGN);
	}

	// adjust bufstep so that the used part of the ring buffer stays compact in memory
	bufStep = bufElemSize * (int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0), 16);

	// dx1/dx2: number of padded columns required left/right of the ROI
	dx1 = std::max(anchor.x - roi.x, 0);
	dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0);

	// recompute border tables
	if ( dx1 > 0 || dx2 > 0 ) {
		if ( rowBorderType == BORDER_CONSTANT ) {
			// constant padding never changes, so fill it into the row
			// buffer(s) once up front
			int nr = isSeparable() ? 1 : (int)rows.size();
			for ( i = 0; i < nr; i++ ) {
				uchar* dst = isSeparable() ? &srcRow[0] : alignPtr(&ringBuf[0], VEC_ALIGN) + bufStep * i;
				memcpy( dst, constVal, dx1 * esz );
				memcpy( dst + (roi.width + ksize.width - 1 - dx2)*esz, constVal, dx2 * esz );
			}
		} else {
			// borderTab maps each padded column to the source column it mirrors
			int btab_esz = borderElemSize, wholeWidth = wholeSize.width;
			int* btab = (int*)&borderTab[0];

			for ( i = 0; i < dx1; i++ ) {
				int p0 = borderInterpolate(i - dx1, wholeWidth, rowBorderType) * btab_esz;
				for ( j = 0; j < btab_esz; j++ ) {
					btab[i* btab_esz + j] = p0 + j;
				}
			}

			for ( i = 0; i < dx2; i++ ) {
				int p0 = borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) * btab_esz;
				for ( j = 0; j < btab_esz; j++ ) {
					btab[(i + dx1)*btab_esz + j] = p0 + j;
				}
			}
		}
	}

	rowCount = dstY = 0;
	startY = startY0 = std::max(roi.y - anchor.y, 0);
	// last source row (exclusive) that contributes to the ROI output
	endY = std::min(roi.y + roi.height + ksize.height - anchor.y - 1, wholeSize.height);
	if ( !columnFilter.empty() ) {
		columnFilter->reset();
	}
	if ( !filter2D.empty() ) {
		filter2D->reset();
	}

	return startY;
}


int FilterEngine::start(const Mat& src, const Rect& _srcRoi,
						bool isolated, int maxBufRows) {
	Rect srcRoi = _srcRoi;

	if ( srcRoi == Rect(0, 0, -1, -1) ) {
		srcRoi = Rect(0, 0, src.cols, src.rows);
	}

	CV_Assert( srcRoi.x >= 0 && srcRoi.y >= 0 &&
			   srcRoi.width >= 0 && srcRoi.height >= 0 &&
			   srcRoi.x + srcRoi.width <= src.cols &&
			   srcRoi.y + srcRoi.height <= src.rows );

	Point ofs;
	Size wholeSize(src.cols, src.rows);
	if ( !isolated ) {
		src.locateROI( wholeSize, ofs );
	}
	start( wholeSize, srcRoi + ofs, maxBufRows );

	return startY - ofs.y;
}


// How many source rows still need to be fed into proceed() before the whole
// ROI can be produced.
int FilterEngine::remainingInputRows() const {
	return endY - (startY + rowCount);
}

// How many destination rows have not been produced yet.
int FilterEngine::remainingOutputRows() const {
	int produced = dstY;
	return roi.height - produced;
}

// Feeds `count` consecutive source rows (stride srcstep) into the engine and
// writes as many completed output rows as possible to dst (stride dststep).
// Returns the number of output rows produced. May be called repeatedly until
// remainingInputRows()/remainingOutputRows() reach zero.
int FilterEngine::proceed( const uchar* src, int srcstep, int count,
						   uchar* dst, int dststep ) {
	// start() must have been called first
	CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 );

	const int* btab = &borderTab[0];
	int esz = (int)getElemSize(srcType), btab_esz = borderElemSize;
	uchar** brows = &rows[0];
	int bufRows = (int)rows.size();
	int cn = CV_MAT_CN(bufType);
	int width = roi.width, kwidth = ksize.width;
	int kheight = ksize.height, ay = anchor.y;
	int _dx1 = dx1, _dx2 = dx2;
	int width1 = roi.width + kwidth - 1;
	int xofs1 = std::min(roi.x, anchor.x);
	bool isSep = isSeparable();
	bool makeBorder = (_dx1 > 0 || _dx2 > 0) && rowBorderType != BORDER_CONSTANT;
	int dy = 0, i = 0;

	// step back so src points at the leftmost pixel the kernel will read
	src -= xofs1 * esz;
	count = std::min(count, remainingInputRows());

	CV_Assert( src && dst && count > 0 );

	for (;; dst += dststep* i, dy += i) {
		// number of input rows to buffer before more output becomes possible
		int dcount = bufRows - ay - startY - rowCount + roi.y;
		dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
		dcount = std::min(dcount, count);
		count -= dcount;
		for ( ; dcount-- > 0; src += srcstep ) {
			// ring-buffer slot for the next source row
			int bi = (startY - startY0 + rowCount) % bufRows;
			uchar* brow = alignPtr(&ringBuf[0], VEC_ALIGN) + bi * bufStep;
			uchar* row = isSep ? &srcRow[0] : brow;

			if ( ++rowCount > bufRows ) {
				// buffer full: the oldest row gets overwritten
				--rowCount;
				++startY;
			}

			memcpy( row + _dx1 * esz, src, (width1 - _dx2 - _dx1)*esz );

			if ( makeBorder ) {
				// fill left/right padding via the precomputed border table;
				// copy int-wise when each table entry covers a whole int
				if ( btab_esz*(int)sizeof(int) == esz ) {
					const int* isrc = (const int*)src;
					int* irow = (int*)row;

					for ( i = 0; i < _dx1 * btab_esz; i++ ) {
						irow[i] = isrc[btab[i]];
					}
					for ( i = 0; i < _dx2 * btab_esz; i++ ) {
						irow[i + (width1 - _dx2)*btab_esz] = isrc[btab[i+_dx1*btab_esz]];
					}
				} else {
					for ( i = 0; i < _dx1 * esz; i++ ) {
						row[i] = src[btab[i]];
					}
					for ( i = 0; i < _dx2 * esz; i++ ) {
						row[i + (width1 - _dx2)*esz] = src[btab[i+_dx1*esz]];
					}
				}
			}

			if ( isSep ) {
				// horizontal pass: filtered row goes into the ring buffer
				(*rowFilter)(row, brow, width, CV_MAT_CN(srcType));
			}
		}

		// collect pointers to the kernel-height window of buffered rows
		int max_i = std::min(bufRows, roi.height - (dstY + dy) + (kheight - 1));
		for ( i = 0; i < max_i; i++ ) {
			int srcY = borderInterpolate(dstY + dy + i + roi.y - ay,
										 wholeSize.height, columnBorderType);
			if ( srcY < 0 ) { // can happen only with constant border type
				brows[i] = alignPtr(&constBorderRow[0], VEC_ALIGN);
			} else {
				CV_Assert( srcY >= startY );
				if ( srcY >= startY + rowCount ) {
					// this row has not been fed in yet; stop and wait for it
					break;
				}
				int bi = (srcY - startY0) % bufRows;
				brows[i] = alignPtr(&ringBuf[0], VEC_ALIGN) + bi * bufStep;
			}
		}
		// not enough buffered rows for even one output row -> need more input
		if ( i < kheight ) {
			break;
		}
		// i complete windows are available: run the vertical / 2D pass
		i -= kheight - 1;
		if ( isSeparable() ) {
			(*columnFilter)((const uchar**)brows, dst, dststep, i, roi.width * cn);
		} else {
			(*filter2D)((const uchar**)brows, dst, dststep, i, roi.width, cn);
		}
	}

	dstY += dy;
	CV_Assert( dstY <= roi.height );
	return dy;
}


// Applies the whole filtering pipeline to src, writing the result into dst at
// offset dstOfs. _srcRoi == Rect(0,0,-1,-1) selects the entire image; when
// isolated is false, pixels of the parent image outside the ROI may be used
// instead of synthesized border values (see start()).
void FilterEngine::apply(const Mat& src, Mat& dst,
						 const Rect& _srcRoi, Point dstOfs, bool isolated) {
	CV_Assert( src.type() == srcType && dst.type() == dstType );

	Rect srcRoi = _srcRoi;
	if ( srcRoi == Rect(0, 0, -1, -1) ) {
		srcRoi = Rect(0, 0, src.cols, src.rows);
	}

	CV_Assert( dstOfs.x >= 0 && dstOfs.y >= 0 &&
			   dstOfs.x + srcRoi.width <= dst.cols &&
			   dstOfs.y + srcRoi.height <= dst.rows );

	// start() returns the first needed source row; feed all rows through endY
	int y = start(src, srcRoi, isolated);
	proceed( src.data + y * src.step, (int)src.step, endY - startY,
			 dst.data + dstOfs.y * dst.step + dstOfs.x * dst.elemSize(), (int)dst.step );
}


/****************************************************************************************\
*                                 Separable linear filter                                *
\****************************************************************************************/

// Classifies a single-channel kernel as a combination of KERNEL_* flags:
// SMOOTH (all coefficients >= 0 and summing to ~1), INTEGER (all
// coefficients integral), and — for centered 1D kernels only — SYMMETRICAL /
// ASYMMETRICAL (even / odd around the anchor).
int getKernelType(const Mat& _kernel, Point anchor) {
	CV_Assert( _kernel.channels() == 1 );

	Mat kernel;
	_kernel.convertTo(kernel, CV_64F);

	const int sz = _kernel.rows * _kernel.cols;
	const double* coeffs = (const double*)kernel.data;

	// start optimistic and strip flags as counterexamples are found
	int type = KERNEL_SMOOTH + KERNEL_INTEGER;
	bool centered1D = (_kernel.rows == 1 || _kernel.cols == 1) &&
					  anchor.x * 2 + 1 == _kernel.cols &&
					  anchor.y * 2 + 1 == _kernel.rows;
	if ( centered1D ) {
		type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL);
	}

	double sum = 0;
	for ( int i = 0; i < sz; i++ ) {
		double a = coeffs[i];
		double b = coeffs[sz - 1 - i]; // mirrored coefficient
		if ( a != b ) {
			type &= ~KERNEL_SYMMETRICAL;
		}
		if ( a != -b ) {
			type &= ~KERNEL_ASYMMETRICAL;
		}
		if ( a < 0 ) {
			type &= ~KERNEL_SMOOTH;
		}
		if ( a != saturate_cast<int>(a) ) {
			type &= ~KERNEL_INTEGER;
		}
		sum += a;
	}

	// a smoothing kernel must (approximately) preserve the mean
	if ( fabs(sum - 1) > FLT_EPSILON*(fabs(sum) + 1) ) {
		type &= ~KERNEL_SMOOTH;
	}
	return type;
}


// No-op vectorized row filter: returning 0 processed elements makes the
// caller fall back to the plain C++ path for the whole row.
struct RowNoVec {
	RowNoVec() {}
	RowNoVec(const Mat&) {}
	int operator()(const uchar*, uchar*, int, int) const { return 0; }
};

// No-op vectorized column filter (scalar fallback); see RowNoVec.
struct ColumnNoVec {
	ColumnNoVec() {}
	ColumnNoVec(const Mat&, int, int, double) {}
	int operator()(const uchar**, uchar*, int) const { return 0; }
};

// No-op vectorized small symmetric row filter (scalar fallback).
struct SymmRowSmallNoVec {
	SymmRowSmallNoVec() {}
	SymmRowSmallNoVec(const Mat&, int) {}
	int operator()(const uchar*, uchar*, int, int) const { return 0; }
};

// No-op vectorized small symmetric column filter (scalar fallback).
struct SymmColumnSmallNoVec {
	SymmColumnSmallNoVec() {}
	SymmColumnSmallNoVec(const Mat&, int, int, double) {}
	int operator()(const uchar**, uchar*, int) const { return 0; }
};

// No-op vectorized 2D (non-separable) filter (scalar fallback).
struct FilterNoVec {
	FilterNoVec() {}
	FilterNoVec(const Mat&, int, double) {}
	int operator()(const uchar**, uchar*, int) const { return 0; }
};


#if CV_SSE2

///////////////////////////////////// 8u-16s & 8u-8u //////////////////////////////////

// SSE2 row filter: 8-bit unsigned input -> 32-bit int output.
// Kernel coefficients are fixed-point ints; they must fit in 16 bits
// ("smallValues") because products are formed with paired 16x16->32
// mullo/mulhi multiplies. operator() returns how many output elements it
// produced; the scalar code handles the remainder.
struct RowVec_8u32s {
	RowVec_8u32s() { smallValues = false; }
	RowVec_8u32s( const Mat& _kernel ) {
		kernel = _kernel;
		smallValues = true;
		int k, ksize = kernel.rows + kernel.cols - 1;
		for ( k = 0; k < ksize; k++ ) {
			int v = ((const int*)kernel.data)[k];
			if ( v < SHRT_MIN || v > SHRT_MAX ) {
				// coefficient does not fit into int16 -> use scalar path only
				smallValues = false;
				break;
			}
		}
	}

	int operator()(const uchar* _src, uchar* _dst, int width, int cn) const {
		if ( !checkHardwareSupport(CV_CPU_SSE2) ) {
			return 0;
		}

		int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
		int* dst = (int*)_dst;
		const int* _kx = (const int*)kernel.data;
		width *= cn;

		if ( smallValues ) {
			// main loop: 16 output values per iteration
			for ( ; i <= width - 16; i += 16 ) {
				const uchar* src = _src + i;
				__m128i f, z = _mm_setzero_si128(), s0 = z, s1 = z, s2 = z, s3 = z;
				__m128i x0, x1, x2, x3;

				for ( k = 0; k < _ksize; k++, src += cn ) {
					// broadcast coefficient k into every 16-bit lane
					f = _mm_cvtsi32_si128(_kx[k]);
					f = _mm_shuffle_epi32(f, 0);
					f = _mm_packs_epi32(f, f);

					// widen 16 source bytes into two 8x16-bit registers
					x0 = _mm_loadu_si128((const __m128i*)src);
					x2 = _mm_unpackhi_epi8(x0, z);
					x0 = _mm_unpacklo_epi8(x0, z);
					// 16x16 -> 32-bit products via mullo/mulhi pairs
					x1 = _mm_mulhi_epi16(x0, f);
					x3 = _mm_mulhi_epi16(x2, f);
					x0 = _mm_mullo_epi16(x0, f);
					x2 = _mm_mullo_epi16(x2, f);

					// interleave lo/hi halves into 32-bit sums and accumulate
					s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
					s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(x0, x1));
					s2 = _mm_add_epi32(s2, _mm_unpacklo_epi16(x2, x3));
					s3 = _mm_add_epi32(s3, _mm_unpackhi_epi16(x2, x3));
				}

				// aligned stores are safe: rows live in the VEC_ALIGN-aligned ring buffer
				_mm_store_si128((__m128i*)(dst + i), s0);
				_mm_store_si128((__m128i*)(dst + i + 4), s1);
				_mm_store_si128((__m128i*)(dst + i + 8), s2);
				_mm_store_si128((__m128i*)(dst + i + 12), s3);
			}

			// tail loop: 4 output values per iteration
			for ( ; i <= width - 4; i += 4 ) {
				const uchar* src = _src + i;
				__m128i f, z = _mm_setzero_si128(), s0 = z, x0, x1;

				for ( k = 0; k < _ksize; k++, src += cn ) {
					f = _mm_cvtsi32_si128(_kx[k]);
					f = _mm_shuffle_epi32(f, 0);
					f = _mm_packs_epi32(f, f);

					x0 = _mm_cvtsi32_si128(*(const int*)src);
					x0 = _mm_unpacklo_epi8(x0, z);
					x1 = _mm_mulhi_epi16(x0, f);
					x0 = _mm_mullo_epi16(x0, f);
					s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
				}
				_mm_store_si128((__m128i*)(dst + i), s0);
			}
		}
		return i;
	}

	Mat kernel;
	bool smallValues;
};


// SSE2 row filter for small symmetric/antisymmetric kernels (ksize 3 or 5),
// 8-bit unsigned input -> 32-bit int output. Exploits symmetry by adding
// (symmetrical) or subtracting (asymmetrical) the mirrored taps before
// multiplying, halving the multiply count; common kernels like [1 2 1] and
// [1 -2 1] get dedicated multiply-free paths. Coefficients must fit in 16
// bits ("smallValues"). Returns the number of output elements produced.
struct SymmRowSmallVec_8u32s {
	SymmRowSmallVec_8u32s() { smallValues = false; }
	SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType ) {
		kernel = _kernel;
		symmetryType = _symmetryType;
		smallValues = true;
		int k, ksize = kernel.rows + kernel.cols - 1;
		for ( k = 0; k < ksize; k++ ) {
			int v = ((const int*)kernel.data)[k];
			if ( v < SHRT_MIN || v > SHRT_MAX ) {
				// coefficient does not fit into int16 -> scalar fallback
				smallValues = false;
				break;
			}
		}
	}

	int operator()(const uchar* src, uchar* _dst, int width, int cn) const {
		if ( !checkHardwareSupport(CV_CPU_SSE2) ) {
			return 0;
		}

		int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1;
		int* dst = (int*)_dst;
		bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
		// kx points at the center tap; kx[-k]==±kx[k] by (anti)symmetry
		const int* kx = (const int*)kernel.data + _ksize / 2;
		if ( !smallValues ) {
			return 0;
		}

		// shift src to the center tap position
		src += (_ksize / 2) * cn;
		width *= cn;

		__m128i z = _mm_setzero_si128();
		if ( symmetrical ) {
			if ( _ksize == 1 ) {
				return 0;
			}
			if ( _ksize == 3 ) {
				// special case [1 2 1]: shifts/adds only, no multiplies
				if ( kx[0] == 2 && kx[1] == 1 )
					for ( ; i <= width - 16; i += 16, src += 16 ) {
						__m128i x0, x1, x2, y0, y1, y2;
						x0 = _mm_loadu_si128((__m128i*)(src - cn));
						x1 = _mm_loadu_si128((__m128i*)src);
						x2 = _mm_loadu_si128((__m128i*)(src + cn));
						y0 = _mm_unpackhi_epi8(x0, z);
						x0 = _mm_unpacklo_epi8(x0, z);
						y1 = _mm_unpackhi_epi8(x1, z);
						x1 = _mm_unpacklo_epi8(x1, z);
						y2 = _mm_unpackhi_epi8(x2, z);
						x2 = _mm_unpacklo_epi8(x2, z);
						x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2));
						y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2));
						// result is non-negative: zero-extend 16 -> 32 bits
						_mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z));
						_mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z));
						_mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z));
						_mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z));
					}
				// special case [1 -2 1] (Laplacian-like): adds/subs only
				else if ( kx[0] == -2 && kx[1] == 1 )
					for ( ; i <= width - 16; i += 16, src += 16 ) {
						__m128i x0, x1, x2, y0, y1, y2;
						x0 = _mm_loadu_si128((__m128i*)(src - cn));
						x1 = _mm_loadu_si128((__m128i*)src);
						x2 = _mm_loadu_si128((__m128i*)(src + cn));
						y0 = _mm_unpackhi_epi8(x0, z);
						x0 = _mm_unpacklo_epi8(x0, z);
						y1 = _mm_unpackhi_epi8(x1, z);
						x1 = _mm_unpacklo_epi8(x1, z);
						y2 = _mm_unpackhi_epi8(x2, z);
						x2 = _mm_unpacklo_epi8(x2, z);
						x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
						y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
						// result may be negative: sign-extend 16 -> 32 bits
						_mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0), 16));
						_mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0), 16));
						_mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0), 16));
						_mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0), 16));
					}
				else {
					// generic symmetric 3-tap: dst = k0*c + k1*(l + r)
					__m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
							k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
					k0 = _mm_packs_epi32(k0, k0);
					k1 = _mm_packs_epi32(k1, k1);

					for ( ; i <= width - 16; i += 16, src += 16 ) {
						__m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
						x0 = _mm_loadu_si128((__m128i*)(src - cn));
						x1 = _mm_loadu_si128((__m128i*)src);
						x2 = _mm_loadu_si128((__m128i*)(src + cn));
						// fold mirrored taps: left + right share the coefficient
						y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
						x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
						y1 = _mm_unpackhi_epi8(x1, z);
						x1 = _mm_unpacklo_epi8(x1, z);

						t1 = _mm_mulhi_epi16(x1, k0);
						t0 = _mm_mullo_epi16(x1, k0);
						x2 = _mm_mulhi_epi16(x0, k1);
						x0 = _mm_mullo_epi16(x0, k1);
						z0 = _mm_unpacklo_epi16(t0, t1);
						z1 = _mm_unpackhi_epi16(t0, t1);
						z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
						z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));

						t1 = _mm_mulhi_epi16(y1, k0);
						t0 = _mm_mullo_epi16(y1, k0);
						y1 = _mm_mulhi_epi16(y0, k1);
						y0 = _mm_mullo_epi16(y0, k1);
						z2 = _mm_unpacklo_epi16(t0, t1);
						z3 = _mm_unpackhi_epi16(t0, t1);
						z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
						z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
						_mm_store_si128((__m128i*)(dst + i), z0);
						_mm_store_si128((__m128i*)(dst + i + 4), z1);
						_mm_store_si128((__m128i*)(dst + i + 8), z2);
						_mm_store_si128((__m128i*)(dst + i + 12), z3);
					}
				}
			} else if ( _ksize == 5 ) {
				// special case [1 0 -2 0 1]: adds/subs only
				if ( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
					for ( ; i <= width - 16; i += 16, src += 16 ) {
						__m128i x0, x1, x2, y0, y1, y2;
						x0 = _mm_loadu_si128((__m128i*)(src - cn * 2));
						x1 = _mm_loadu_si128((__m128i*)src);
						x2 = _mm_loadu_si128((__m128i*)(src + cn * 2));
						y0 = _mm_unpackhi_epi8(x0, z);
						x0 = _mm_unpacklo_epi8(x0, z);
						y1 = _mm_unpackhi_epi8(x1, z);
						x1 = _mm_unpacklo_epi8(x1, z);
						y2 = _mm_unpackhi_epi8(x2, z);
						x2 = _mm_unpacklo_epi8(x2, z);
						x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
						y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
						_mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0), 16));
						_mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0), 16));
						_mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0), 16));
						_mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0), 16));
					}
				else {
					// generic symmetric 5-tap:
					// dst = k0*c + k1*(l1 + r1) + k2*(l2 + r2)
					__m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
							k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
							k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
					k0 = _mm_packs_epi32(k0, k0);
					k1 = _mm_packs_epi32(k1, k1);
					k2 = _mm_packs_epi32(k2, k2);

					for ( ; i <= width - 16; i += 16, src += 16 ) {
						__m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
						x0 = _mm_loadu_si128((__m128i*)(src - cn));
						x1 = _mm_loadu_si128((__m128i*)src);
						x2 = _mm_loadu_si128((__m128i*)(src + cn));
						y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
						x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
						y1 = _mm_unpackhi_epi8(x1, z);
						x1 = _mm_unpacklo_epi8(x1, z);

						t1 = _mm_mulhi_epi16(x1, k0);
						t0 = _mm_mullo_epi16(x1, k0);
						x2 = _mm_mulhi_epi16(x0, k1);
						x0 = _mm_mullo_epi16(x0, k1);
						z0 = _mm_unpacklo_epi16(t0, t1);
						z1 = _mm_unpackhi_epi16(t0, t1);
						z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
						z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));

						t1 = _mm_mulhi_epi16(y1, k0);
						t0 = _mm_mullo_epi16(y1, k0);
						y1 = _mm_mulhi_epi16(y0, k1);
						y0 = _mm_mullo_epi16(y0, k1);
						z2 = _mm_unpacklo_epi16(t0, t1);
						z3 = _mm_unpackhi_epi16(t0, t1);
						z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
						z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

						// outermost taps (distance 2*cn), coefficient k2
						x0 = _mm_loadu_si128((__m128i*)(src - cn * 2));
						x1 = _mm_loadu_si128((__m128i*)(src + cn * 2));
						y1 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
						y0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

						t1 = _mm_mulhi_epi16(y0, k2);
						t0 = _mm_mullo_epi16(y0, k2);
						y0 = _mm_mullo_epi16(y1, k2);
						y1 = _mm_mulhi_epi16(y1, k2);
						z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
						z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
						z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
						z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

						_mm_store_si128((__m128i*)(dst + i), z0);
						_mm_store_si128((__m128i*)(dst + i + 4), z1);
						_mm_store_si128((__m128i*)(dst + i + 8), z2);
						_mm_store_si128((__m128i*)(dst + i + 12), z3);
					}
				}
			}
		} else {
			// antisymmetric kernels: center tap is 0, mirrored taps negate
			if ( _ksize == 3 ) {
				// special case [-1 0 1] (derivative): subtraction only
				if ( kx[0] == 0 && kx[1] == 1 )
					for ( ; i <= width - 16; i += 16, src += 16 ) {
						__m128i x0, x1, y0;
						x0 = _mm_loadu_si128((__m128i*)(src + cn));
						x1 = _mm_loadu_si128((__m128i*)(src - cn));
						y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
						x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
						_mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0), 16));
						_mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0), 16));
						_mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0), 16));
						_mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0), 16));
					}
				else {
					// generic antisymmetric 3-tap: dst = k1*(r - l)
					__m128i k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
					k1 = _mm_packs_epi32(k1, k1);

					for ( ; i <= width - 16; i += 16, src += 16 ) {
						__m128i x0, x1, y0, y1, z0, z1, z2, z3;
						x0 = _mm_loadu_si128((__m128i*)(src + cn));
						x1 = _mm_loadu_si128((__m128i*)(src - cn));
						y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
						x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

						x1 = _mm_mulhi_epi16(x0, k1);
						x0 = _mm_mullo_epi16(x0, k1);
						z0 = _mm_unpacklo_epi16(x0, x1);
						z1 = _mm_unpackhi_epi16(x0, x1);

						y1 = _mm_mulhi_epi16(y0, k1);
						y0 = _mm_mullo_epi16(y0, k1);
						z2 = _mm_unpacklo_epi16(y0, y1);
						z3 = _mm_unpackhi_epi16(y0, y1);
						_mm_store_si128((__m128i*)(dst + i), z0);
						_mm_store_si128((__m128i*)(dst + i + 4), z1);
						_mm_store_si128((__m128i*)(dst + i + 8), z2);
						_mm_store_si128((__m128i*)(dst + i + 12), z3);
					}
				}
			} else if ( _ksize == 5 ) {
				// generic antisymmetric 5-tap:
				// dst = k1*(r1 - l1) + k2*(r2 - l2); center tap k0 unused
				__m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
						k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
						k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
				k0 = _mm_packs_epi32(k0, k0);
				k1 = _mm_packs_epi32(k1, k1);
				k2 = _mm_packs_epi32(k2, k2);

				for ( ; i <= width - 16; i += 16, src += 16 ) {
					__m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
					x0 = _mm_loadu_si128((__m128i*)(src + cn));
					x2 = _mm_loadu_si128((__m128i*)(src - cn));
					y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
					x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));

					x2 = _mm_mulhi_epi16(x0, k1);
					x0 = _mm_mullo_epi16(x0, k1);
					z0 = _mm_unpacklo_epi16(x0, x2);
					z1 = _mm_unpackhi_epi16(x0, x2);
					y1 = _mm_mulhi_epi16(y0, k1);
					y0 = _mm_mullo_epi16(y0, k1);
					z2 = _mm_unpacklo_epi16(y0, y1);
					z3 = _mm_unpackhi_epi16(y0, y1);

					x0 = _mm_loadu_si128((__m128i*)(src + cn * 2));
					x1 = _mm_loadu_si128((__m128i*)(src - cn * 2));
					y1 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
					y0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

					t1 = _mm_mulhi_epi16(y0, k2);
					t0 = _mm_mullo_epi16(y0, k2);
					y0 = _mm_mullo_epi16(y1, k2);
					y1 = _mm_mulhi_epi16(y1, k2);
					z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
					z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
					z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
					z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

					_mm_store_si128((__m128i*)(dst + i), z0);
					_mm_store_si128((__m128i*)(dst + i + 4), z1);
					_mm_store_si128((__m128i*)(dst + i + 8), z2);
					_mm_store_si128((__m128i*)(dst + i + 12), z3);
				}
			}
		}

		// generic SIMD tail: undo the centering and process 4 pixels at a
		// time with the full (non-folded) kernel
		src -= (_ksize / 2) * cn;
		kx -= _ksize / 2;
		for ( ; i <= width - 4; i += 4, src += 4 ) {
			__m128i f, s0 = z, x0, x1;

			for ( k = j = 0; k < _ksize; k++, j += cn ) {
				f = _mm_cvtsi32_si128(kx[k]);
				f = _mm_shuffle_epi32(f, 0);
				f = _mm_packs_epi32(f, f);

				x0 = _mm_cvtsi32_si128(*(const int*)(src + j));
				x0 = _mm_unpacklo_epi8(x0, z);
				x1 = _mm_mulhi_epi16(x0, f);
				x0 = _mm_mullo_epi16(x0, f);
				s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
			}
			_mm_store_si128((__m128i*)(dst + i), s0);
		}

		return i;
	}

	Mat kernel;
	int symmetryType;
	bool smallValues;
};


// Column (vertical) pass of a separable filter: combines fixed-point 32-bit
// intermediate rows into saturated 8-bit output, for an odd-length symmetric
// or asymmetric kernel. Accumulation is done in single-precision float.
struct SymmColumnVec_32s8u {
	SymmColumnVec_32s8u() { symmetryType = 0; }
	SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta) {
		symmetryType = _symmetryType;
		// The row pass produced values scaled by 2^_bits; fold that scale into
		// a float kernel (and into delta) so no final shift is needed here.
		_kernel.convertTo(kernel, CV_32F, 1. / (1 << _bits), 0);
		delta = (float)(_delta / (1 << _bits));
		CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
	}

	// _src is a window of row pointers; src[-k]..src[k] are valid around the
	// middle row. Returns how many leading pixels were produced (multiple of
	// 4); the caller finishes the remainder with scalar code.
	int operator()(const uchar** _src, uchar* dst, int width) const {
		if ( !checkHardwareSupport(CV_CPU_SSE2) ) {
			return 0; // no SSE2: let the generic scalar path handle everything
		}

		int ksize2 = (kernel.rows + kernel.cols - 1) / 2;
		const float* ky = (const float*)kernel.data + ksize2; // central tap
		int i = 0, k;
		bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
		const int** src = (const int**)_src;
		const __m128i* S, *S2;
		__m128 d4 = _mm_set1_ps(delta);

		if ( symmetrical ) {
			// dst = ky[0]*row[0] + sum_k ky[k]*(row[k] + row[-k]) + delta
			for ( ; i <= width - 16; i += 16 ) {
				__m128 f = _mm_load_ss(ky);
				f = _mm_shuffle_ps(f, f, 0);
				__m128 s0, s1, s2, s3;
				__m128i x0, x1;
				// NOTE(review): aligned loads — intermediate rows are presumed
				// 16-byte aligned by the caller's buffer allocation.
				S = (const __m128i*)(src[0] + i);
				s0 = _mm_cvtepi32_ps(_mm_load_si128(S));
				s1 = _mm_cvtepi32_ps(_mm_load_si128(S + 1));
				s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
				s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
				s2 = _mm_cvtepi32_ps(_mm_load_si128(S + 2));
				s3 = _mm_cvtepi32_ps(_mm_load_si128(S + 3));
				s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
				s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

				for ( k = 1; k <= ksize2; k++ ) {
					S = (const __m128i*)(src[k] + i);
					S2 = (const __m128i*)(src[-k] + i);
					f = _mm_load_ss(ky + k);
					f = _mm_shuffle_ps(f, f, 0);
					// Symmetric pair: add mirrored rows first, multiply once.
					x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
					x1 = _mm_add_epi32(_mm_load_si128(S + 1), _mm_load_si128(S2 + 1));
					s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
					s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
					x0 = _mm_add_epi32(_mm_load_si128(S + 2), _mm_load_si128(S2 + 2));
					x1 = _mm_add_epi32(_mm_load_si128(S + 3), _mm_load_si128(S2 + 3));
					s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
					s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
				}

				// float -> int32 (rounded) -> int16 (saturated) -> uint8 (saturated)
				x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
				x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
				x0 = _mm_packus_epi16(x0, x1);
				_mm_storeu_si128((__m128i*)(dst + i), x0);
			}

			// Same computation, 4 pixels at a time.
			for ( ; i <= width - 4; i += 4 ) {
				__m128 f = _mm_load_ss(ky);
				f = _mm_shuffle_ps(f, f, 0);
				__m128i x0;
				__m128 s0 = _mm_cvtepi32_ps(_mm_load_si128((const __m128i*)(src[0] + i)));
				s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

				for ( k = 1; k <= ksize2; k++ ) {
					S = (const __m128i*)(src[k] + i);
					S2 = (const __m128i*)(src[-k] + i);
					f = _mm_load_ss(ky + k);
					f = _mm_shuffle_ps(f, f, 0);
					x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
					s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
				}

				x0 = _mm_cvtps_epi32(s0);
				x0 = _mm_packs_epi32(x0, x0);
				x0 = _mm_packus_epi16(x0, x0);
				*(int*)(dst + i) = _mm_cvtsi128_si32(x0);
			}
		} else {
			// Asymmetric kernel: center tap contributes nothing (loop starts at
			// k = 1), dst = sum_k ky[k]*(row[k] - row[-k]) + delta.
			for ( ; i <= width - 16; i += 16 ) {
				__m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
				__m128i x0, x1;

				for ( k = 1; k <= ksize2; k++ ) {
					S = (const __m128i*)(src[k] + i);
					S2 = (const __m128i*)(src[-k] + i);
					f = _mm_load_ss(ky + k);
					f = _mm_shuffle_ps(f, f, 0);
					x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
					x1 = _mm_sub_epi32(_mm_load_si128(S + 1), _mm_load_si128(S2 + 1));
					s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
					s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
					x0 = _mm_sub_epi32(_mm_load_si128(S + 2), _mm_load_si128(S2 + 2));
					x1 = _mm_sub_epi32(_mm_load_si128(S + 3), _mm_load_si128(S2 + 3));
					s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
					s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
				}

				x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
				x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
				x0 = _mm_packus_epi16(x0, x1);
				_mm_storeu_si128((__m128i*)(dst + i), x0);
			}

			for ( ; i <= width - 4; i += 4 ) {
				__m128 f, s0 = d4;
				__m128i x0;

				for ( k = 1; k <= ksize2; k++ ) {
					S = (const __m128i*)(src[k] + i);
					S2 = (const __m128i*)(src[-k] + i);
					f = _mm_load_ss(ky + k);
					f = _mm_shuffle_ps(f, f, 0);
					x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
					s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
				}

				x0 = _mm_cvtps_epi32(s0);
				x0 = _mm_packs_epi32(x0, x0);
				x0 = _mm_packus_epi16(x0, x0);
				*(int*)(dst + i) = _mm_cvtsi128_si32(x0);
			}
		}

		return i;
	}

	int symmetryType;
	float delta;
	Mat kernel;
};


// Column pass for small (3-tap) kernels: 32-bit fixed-point intermediate rows
// -> saturated 16-bit output. Recognizes the common [1 2 1] and [1 -2 1]
// kernels and handles them with pure integer arithmetic.
struct SymmColumnSmallVec_32s16s {
	SymmColumnSmallVec_32s16s() { symmetryType = 0; }
	SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta) {
		symmetryType = _symmetryType;
		// Undo the 2^_bits fixed-point scale of the row pass (kernel + delta).
		_kernel.convertTo(kernel, CV_32F, 1. / (1 << _bits), 0);
		delta = (float)(_delta / (1 << _bits));
		CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
	}

	// Processes pixels in groups of 8; returns the count handled so the caller
	// can finish the tail with scalar code.
	int operator()(const uchar** _src, uchar* _dst, int width) const {
		if ( !checkHardwareSupport(CV_CPU_SSE2) ) {
			return 0;
		}

		int ksize2 = (kernel.rows + kernel.cols - 1) / 2;
		const float* ky = (const float*)kernel.data + ksize2; // central tap
		int i = 0;
		bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
		const int** src = (const int**)_src;
		// Three-row window: S0 = row above, S1 = middle, S2 = row below.
		const int* S0 = src[-1], *S1 = src[0], *S2 = src[1];
		short* dst = (short*)_dst;
		__m128 df4 = _mm_set1_ps(delta);
		// Integer delta (rounded) for the pure-integer special cases below.
		__m128i d4 = _mm_cvtps_epi32(df4);

		if ( symmetrical ) {
			if ( ky[0] == 2 && ky[1] == 1 ) {
				// [1 2 1] smoothing kernel: integer-only path.
				for ( ; i <= width - 8; i += 8 ) {
					__m128i s0, s1, s2, s3, s4, s5;
					s0 = _mm_load_si128((__m128i*)(S0 + i));
					s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
					s2 = _mm_load_si128((__m128i*)(S1 + i));
					s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
					s4 = _mm_load_si128((__m128i*)(S2 + i));
					s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
					s0 = _mm_add_epi32(s0, _mm_add_epi32(s4, _mm_add_epi32(s2, s2)));
					s1 = _mm_add_epi32(s1, _mm_add_epi32(s5, _mm_add_epi32(s3, s3)));
					s0 = _mm_add_epi32(s0, d4);
					s1 = _mm_add_epi32(s1, d4);
					// packs_epi32 saturates the int32 sums to int16.
					_mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
				}
			} else if ( ky[0] == -2 && ky[1] == 1 ) {
				// [1 -2 1] second-derivative kernel: integer-only path.
				for ( ; i <= width - 8; i += 8 ) {
					__m128i s0, s1, s2, s3, s4, s5;
					s0 = _mm_load_si128((__m128i*)(S0 + i));
					s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
					s2 = _mm_load_si128((__m128i*)(S1 + i));
					s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
					s4 = _mm_load_si128((__m128i*)(S2 + i));
					s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
					s0 = _mm_add_epi32(s0, _mm_sub_epi32(s4, _mm_add_epi32(s2, s2)));
					s1 = _mm_add_epi32(s1, _mm_sub_epi32(s5, _mm_add_epi32(s3, s3)));
					s0 = _mm_add_epi32(s0, d4);
					s1 = _mm_add_epi32(s1, d4);
					_mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
				}
			} else {
				// General symmetric 3-tap: accumulate in float.
				__m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
				for ( ; i <= width - 8; i += 8 ) {
					__m128 s0, s1;
					s0 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i)));
					s1 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i + 4)));
					s0 = _mm_add_ps(_mm_mul_ps(s0, k0), df4);
					s1 = _mm_add_ps(_mm_mul_ps(s1, k0), df4);
					__m128i x0, x1;
					x0 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i)),
									   _mm_load_si128((__m128i*)(S2 + i)));
					x1 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
									   _mm_load_si128((__m128i*)(S2 + i + 4)));
					s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), k1));
					s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), k1));
					x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
					_mm_storeu_si128((__m128i*)(dst + i), x0);
				}
			}
		} else {
			if ( fabs(ky[1]) == 1 && ky[1] == -ky[-1] ) {
				// [-1 0 1] (or [1 0 -1]) kernel: pure row difference.
				if ( ky[1] < 0 ) {
					std::swap(S0, S2); // flip sign by swapping the rows
				}
				for ( ; i <= width - 8; i += 8 ) {
					__m128i s0, s1, s2, s3;
					s0 = _mm_load_si128((__m128i*)(S2 + i));
					s1 = _mm_load_si128((__m128i*)(S2 + i + 4));
					s2 = _mm_load_si128((__m128i*)(S0 + i));
					s3 = _mm_load_si128((__m128i*)(S0 + i + 4));
					s0 = _mm_add_epi32(_mm_sub_epi32(s0, s2), d4);
					s1 = _mm_add_epi32(_mm_sub_epi32(s1, s3), d4);
					_mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
				}
			} else {
				// General asymmetric 3-tap: ky[0] is skipped (asymmetric center
				// tap does not contribute); float accumulation.
				__m128 k1 = _mm_set1_ps(ky[1]);
				for ( ; i <= width - 8; i += 8 ) {
					__m128 s0 = df4, s1 = df4;
					__m128i x0, x1;
					x0 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S0 + i)),
									   _mm_load_si128((__m128i*)(S2 + i)));
					x1 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
									   _mm_load_si128((__m128i*)(S2 + i + 4)));
					s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), k1));
					s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), k1));
					x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
					_mm_storeu_si128((__m128i*)(dst + i), x0);
				}
			}
		}

		return i;
	}

	int symmetryType;
	float delta;
	Mat kernel;
};


/////////////////////////////////////// 32f //////////////////////////////////

// Horizontal (row) convolution for single-precision data: each output value
// is the dot product of the kernel with cn-strided source samples. Processes
// pixels eight at a time and returns how many were handled; the caller
// completes the remainder with scalar code.
struct RowVec_32f {
	RowVec_32f() {}
	RowVec_32f( const Mat& _kernel ) {
		kernel = _kernel;
	}

	int operator()(const uchar* _src, uchar* _dst, int width, int cn) const {
		// Without SSE, defer everything to the generic scalar implementation.
		if ( !checkHardwareSupport(CV_CPU_SSE) ) {
			return 0;
		}

		const int ksize = kernel.rows + kernel.cols - 1;
		const float* taps = (const float*)kernel.data;
		float* out = (float*)_dst;
		width *= cn;

		int i = 0;
		for ( ; i + 8 <= width; i += 8 ) {
			const float* sp = (const float*)_src + i;
			__m128 accLo = _mm_setzero_ps();
			__m128 accHi = _mm_setzero_ps();
			for ( int k = 0; k < ksize; k++, sp += cn ) {
				// Broadcast the k-th tap, then accumulate two 4-float groups.
				__m128 coef = _mm_set1_ps(taps[k]);
				accLo = _mm_add_ps(accLo, _mm_mul_ps(_mm_loadu_ps(sp), coef));
				accHi = _mm_add_ps(accHi, _mm_mul_ps(_mm_loadu_ps(sp + 4), coef));
			}
			// Aligned stores, matching the original contract: the destination
			// buffer is presumed 16-byte aligned by the caller.
			_mm_store_ps(out + i, accLo);
			_mm_store_ps(out + i + 4, accHi);
		}
		return i;
	}

	Mat kernel;
};


// Horizontal pass for small (3- or 5-tap) float kernels with symmetry.
// Special-cases the common [1 2 1], [1 -2 1], [-1 0 1] and [1 0 -2 0 1]
// kernels; kernel sizes other than 1/3/5 are not vectorized (returns 0).
struct SymmRowSmallVec_32f {
	SymmRowSmallVec_32f() {}
	SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType ) {
		kernel = _kernel;
		symmetryType = _symmetryType;
	}

	// Processes pixels in groups of 8 and returns the count handled.
	int operator()(const uchar* _src, uchar* _dst, int width, int cn) const {
		if ( !checkHardwareSupport(CV_CPU_SSE) ) {
			return 0;
		}

		int i = 0, _ksize = kernel.rows + kernel.cols - 1;
		float* dst = (float*)_dst;
		// Center src on the kernel anchor so src[-cn]/src[+cn] address the
		// left/right neighbours of the current pixel.
		const float* src = (const float*)_src + (_ksize / 2) * cn;
		bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
		const float* kx = (const float*)kernel.data + _ksize / 2; // central tap
		width *= cn;

		if ( symmetrical ) {
			if ( _ksize == 1 ) {
				return 0; // identity-sized kernel: nothing worth vectorizing
			}
			if ( _ksize == 3 ) {
				if ( kx[0] == 2 && kx[1] == 1 )
					// [1 2 1]: additions only, no multiplies.
					for ( ; i <= width - 8; i += 8, src += 8 ) {
						__m128 x0, x1, x2, y0, y1, y2;
						x0 = _mm_loadu_ps(src - cn);
						x1 = _mm_loadu_ps(src);
						x2 = _mm_loadu_ps(src + cn);
						y0 = _mm_loadu_ps(src - cn + 4);
						y1 = _mm_loadu_ps(src + 4);
						y2 = _mm_loadu_ps(src + cn + 4);
						x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2));
						y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2));
						_mm_store_ps(dst + i, x0);
						_mm_store_ps(dst + i + 4, y0);
					}
				else if ( kx[0] == -2 && kx[1] == 1 )
					// [1 -2 1]: second derivative, additions/subtractions only.
					for ( ; i <= width - 8; i += 8, src += 8 ) {
						__m128 x0, x1, x2, y0, y1, y2;
						x0 = _mm_loadu_ps(src - cn);
						x1 = _mm_loadu_ps(src);
						x2 = _mm_loadu_ps(src + cn);
						y0 = _mm_loadu_ps(src - cn + 4);
						y1 = _mm_loadu_ps(src + 4);
						y2 = _mm_loadu_ps(src + cn + 4);
						x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
						y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
						_mm_store_ps(dst + i, x0);
						_mm_store_ps(dst + i + 4, y0);
					}
				else {
					// General symmetric 3-tap: k0*center + k1*(left + right).
					__m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]);
					for ( ; i <= width - 8; i += 8, src += 8 ) {
						__m128 x0, x1, x2, y0, y1, y2;
						x0 = _mm_loadu_ps(src - cn);
						x1 = _mm_loadu_ps(src);
						x2 = _mm_loadu_ps(src + cn);
						y0 = _mm_loadu_ps(src - cn + 4);
						y1 = _mm_loadu_ps(src + 4);
						y2 = _mm_loadu_ps(src + cn + 4);

						x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
						y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
						x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
						y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
						_mm_store_ps(dst + i, x0);
						_mm_store_ps(dst + i + 4, y0);
					}
				}
			} else if ( _ksize == 5 ) {
				if ( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
					// [1 0 -2 0 1]: only center and +/-2cn samples contribute.
					for ( ; i <= width - 8; i += 8, src += 8 ) {
						__m128 x0, x1, x2, y0, y1, y2;
						x0 = _mm_loadu_ps(src - cn * 2);
						x1 = _mm_loadu_ps(src);
						x2 = _mm_loadu_ps(src + cn * 2);
						y0 = _mm_loadu_ps(src - cn * 2 + 4);
						y1 = _mm_loadu_ps(src + 4);
						y2 = _mm_loadu_ps(src + cn * 2 + 4);
						x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
						y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
						_mm_store_ps(dst + i, x0);
						_mm_store_ps(dst + i + 4, y0);
					}
				else {
					// General symmetric 5-tap:
					// k0*c + k1*(c-cn + c+cn) + k2*(c-2cn + c+2cn).
					__m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
					for ( ; i <= width - 8; i += 8, src += 8 ) {
						__m128 x0, x1, x2, y0, y1, y2;
						x0 = _mm_loadu_ps(src - cn);
						x1 = _mm_loadu_ps(src);
						x2 = _mm_loadu_ps(src + cn);
						y0 = _mm_loadu_ps(src - cn + 4);
						y1 = _mm_loadu_ps(src + 4);
						y2 = _mm_loadu_ps(src + cn + 4);

						x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
						y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
						x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
						y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));

						x2 = _mm_add_ps(_mm_loadu_ps(src + cn * 2), _mm_loadu_ps(src - cn * 2));
						y2 = _mm_add_ps(_mm_loadu_ps(src + cn * 2 + 4), _mm_loadu_ps(src - cn * 2 + 4));
						x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
						y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));

						_mm_store_ps(dst + i, x0);
						_mm_store_ps(dst + i + 4, y0);
					}
				}
			}
		} else {
			// Asymmetric kernels: center tap is zero, mirrored taps negate.
			if ( _ksize == 3 ) {
				if ( kx[0] == 0 && kx[1] == 1 )
					// [-1 0 1]: plain difference of neighbours.
					for ( ; i <= width - 8; i += 8, src += 8 ) {
						__m128 x0, x2, y0, y2;
						x0 = _mm_loadu_ps(src + cn);
						x2 = _mm_loadu_ps(src - cn);
						y0 = _mm_loadu_ps(src + cn + 4);
						y2 = _mm_loadu_ps(src - cn + 4);
						x0 = _mm_sub_ps(x0, x2);
						y0 = _mm_sub_ps(y0, y2);
						_mm_store_ps(dst + i, x0);
						_mm_store_ps(dst + i + 4, y0);
					}
				else {
					__m128 k1 = _mm_set1_ps(kx[1]);
					for ( ; i <= width - 8; i += 8, src += 8 ) {
						__m128 x0, x2, y0, y2;
						x0 = _mm_loadu_ps(src + cn);
						x2 = _mm_loadu_ps(src - cn);
						y0 = _mm_loadu_ps(src + cn + 4);
						y2 = _mm_loadu_ps(src - cn + 4);

						x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
						y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
						_mm_store_ps(dst + i, x0);
						_mm_store_ps(dst + i + 4, y0);
					}
				}
			} else if ( _ksize == 5 ) {
				// General asymmetric 5-tap:
				// k1*(c+cn - c-cn) + k2*(c+2cn - c-2cn).
				__m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
				for ( ; i <= width - 8; i += 8, src += 8 ) {
					__m128 x0, x2, y0, y2;
					x0 = _mm_loadu_ps(src + cn);
					x2 = _mm_loadu_ps(src - cn);
					y0 = _mm_loadu_ps(src + cn + 4);
					y2 = _mm_loadu_ps(src - cn + 4);

					x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
					y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);

					x2 = _mm_sub_ps(_mm_loadu_ps(src + cn * 2), _mm_loadu_ps(src - cn * 2));
					y2 = _mm_sub_ps(_mm_loadu_ps(src + cn * 2 + 4), _mm_loadu_ps(src - cn * 2 + 4));
					x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
					y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));

					_mm_store_ps(dst + i, x0);
					_mm_store_ps(dst + i + 4, y0);
				}
			}
		}

		return i;
	}

	Mat kernel;
	int symmetryType;
};


// Column (vertical) pass for float data with an odd-length symmetric or
// asymmetric kernel. src[-k]..src[k] index the rows of the filter window
// around the middle row; ky points at the central kernel tap.
//
// Fix: removed dead stores to S/S2 in both 4-pixel tail loops and the unused
// "S = src[0] + i" in the asymmetric 16-pixel loop — those values were never
// read before being overwritten.
struct SymmColumnVec_32f {
	SymmColumnVec_32f() { symmetryType = 0; }
	SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta) {
		symmetryType = _symmetryType;
		kernel = _kernel;
		delta = (float)_delta;
		CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
	}

	// Returns the number of fully processed pixels (multiple of 4); the caller
	// completes the remaining tail with scalar code.
	int operator()(const uchar** _src, uchar* _dst, int width) const {
		if ( !checkHardwareSupport(CV_CPU_SSE) ) {
			return 0; // no SSE: generic scalar path handles everything
		}

		int ksize2 = (kernel.rows + kernel.cols - 1) / 2;
		const float* ky = (const float*)kernel.data + ksize2; // central tap
		int i = 0, k;
		bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
		const float** src = (const float**)_src;
		const float* S, *S2;
		float* dst = (float*)_dst;
		__m128 d4 = _mm_set1_ps(delta);

		if ( symmetrical ) {
			// dst = ky[0]*row[0] + sum_k ky[k]*(row[k] + row[-k]) + delta
			for ( ; i <= width - 16; i += 16 ) {
				__m128 f = _mm_load_ss(ky);
				f = _mm_shuffle_ps(f, f, 0);
				__m128 s0, s1, s2, s3;
				__m128 x0, x1;
				// Aligned loads: the intermediate rows are presumed 16-byte
				// aligned by the caller's buffer allocation.
				S = src[0] + i;
				s0 = _mm_load_ps(S);
				s1 = _mm_load_ps(S + 4);
				s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
				s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
				s2 = _mm_load_ps(S + 8);
				s3 = _mm_load_ps(S + 12);
				s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
				s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

				for ( k = 1; k <= ksize2; k++ ) {
					S = src[k] + i;
					S2 = src[-k] + i;
					f = _mm_load_ss(ky + k);
					f = _mm_shuffle_ps(f, f, 0);
					// Mirrored rows are summed first, multiplied once.
					x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
					x1 = _mm_add_ps(_mm_load_ps(S + 4), _mm_load_ps(S2 + 4));
					s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
					s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
					x0 = _mm_add_ps(_mm_load_ps(S + 8), _mm_load_ps(S2 + 8));
					x1 = _mm_add_ps(_mm_load_ps(S + 12), _mm_load_ps(S2 + 12));
					s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
					s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
				}

				_mm_storeu_ps(dst + i, s0);
				_mm_storeu_ps(dst + i + 4, s1);
				_mm_storeu_ps(dst + i + 8, s2);
				_mm_storeu_ps(dst + i + 12, s3);
			}

			// Same computation, 4 pixels at a time.
			for ( ; i <= width - 4; i += 4 ) {
				__m128 f = _mm_load_ss(ky);
				f = _mm_shuffle_ps(f, f, 0);
				__m128 x0, s0 = _mm_load_ps(src[0] + i);
				s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

				for ( k = 1; k <= ksize2; k++ ) {
					f = _mm_load_ss(ky + k);
					f = _mm_shuffle_ps(f, f, 0);
					x0 = _mm_add_ps(_mm_load_ps(src[k] + i), _mm_load_ps(src[-k] + i));
					s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
				}

				_mm_storeu_ps(dst + i, s0);
			}
		} else {
			// Asymmetric kernel: the center tap is skipped (loop starts at
			// k = 1), dst = sum_k ky[k]*(row[k] - row[-k]) + delta.
			for ( ; i <= width - 16; i += 16 ) {
				__m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
				__m128 x0, x1;

				for ( k = 1; k <= ksize2; k++ ) {
					S = src[k] + i;
					S2 = src[-k] + i;
					f = _mm_load_ss(ky + k);
					f = _mm_shuffle_ps(f, f, 0);
					x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
					x1 = _mm_sub_ps(_mm_load_ps(S + 4), _mm_load_ps(S2 + 4));
					s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
					s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
					x0 = _mm_sub_ps(_mm_load_ps(S + 8), _mm_load_ps(S2 + 8));
					x1 = _mm_sub_ps(_mm_load_ps(S + 12), _mm_load_ps(S2 + 12));
					s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
					s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
				}

				_mm_storeu_ps(dst + i, s0);
				_mm_storeu_ps(dst + i + 4, s1);
				_mm_storeu_ps(dst + i + 8, s2);
				_mm_storeu_ps(dst + i + 12, s3);
			}

			for ( ; i <= width - 4; i += 4 ) {
				__m128 f, x0, s0 = d4;

				for ( k = 1; k <= ksize2; k++ ) {
					f = _mm_load_ss(ky + k);
					f = _mm_shuffle_ps(f, f, 0);
					x0 = _mm_sub_ps(_mm_load_ps(src[k] + i), _mm_load_ps(src[-k] + i));
					s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
				}

				_mm_storeu_ps(dst + i, s0);
			}
		}

		return i;
	}

	int symmetryType;
	float delta;
	Mat kernel;
};


// Column pass for small (3-tap) float kernels. Recognizes the common
// [1 2 1], [1 -2 1] and [-1 0 1]-style kernels and handles them without
// per-tap multiplies.
struct SymmColumnSmallVec_32f {
	SymmColumnSmallVec_32f() { symmetryType = 0; }
	SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta) {
		symmetryType = _symmetryType;
		kernel = _kernel;
		delta = (float)_delta;
		CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
	}

	// Processes pixels in groups of 8; returns the count handled so the
	// caller can finish the tail with scalar code.
	int operator()(const uchar** _src, uchar* _dst, int width) const {
		if ( !checkHardwareSupport(CV_CPU_SSE) ) {
			return 0;
		}

		int ksize2 = (kernel.rows + kernel.cols - 1) / 2;
		const float* ky = (const float*)kernel.data + ksize2; // central tap
		int i = 0;
		bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
		const float** src = (const float**)_src;
		// Three-row window: S0 = row above, S1 = middle, S2 = row below.
		const float* S0 = src[-1], *S1 = src[0], *S2 = src[1];
		float* dst = (float*)_dst;
		__m128 d4 = _mm_set1_ps(delta);

		if ( symmetrical ) {
			if ( ky[0] == 2 && ky[1] == 1 ) {
				// [1 2 1] smoothing kernel: additions only.
				for ( ; i <= width - 8; i += 8 ) {
					__m128 s0, s1, s2, s3, s4, s5;
					s0 = _mm_load_ps(S0 + i);
					s1 = _mm_load_ps(S0 + i + 4);
					s2 = _mm_load_ps(S1 + i);
					s3 = _mm_load_ps(S1 + i + 4);
					s4 = _mm_load_ps(S2 + i);
					s5 = _mm_load_ps(S2 + i + 4);
					s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2)));
					s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3)));
					s0 = _mm_add_ps(s0, d4);
					s1 = _mm_add_ps(s1, d4);
					_mm_storeu_ps(dst + i, s0);
					_mm_storeu_ps(dst + i + 4, s1);
				}
			} else if ( ky[0] == -2 && ky[1] == 1 ) {
				// [1 -2 1] second-derivative kernel.
				for ( ; i <= width - 8; i += 8 ) {
					__m128 s0, s1, s2, s3, s4, s5;
					s0 = _mm_load_ps(S0 + i);
					s1 = _mm_load_ps(S0 + i + 4);
					s2 = _mm_load_ps(S1 + i);
					s3 = _mm_load_ps(S1 + i + 4);
					s4 = _mm_load_ps(S2 + i);
					s5 = _mm_load_ps(S2 + i + 4);
					s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2)));
					s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3)));
					s0 = _mm_add_ps(s0, d4);
					s1 = _mm_add_ps(s1, d4);
					_mm_storeu_ps(dst + i, s0);
					_mm_storeu_ps(dst + i + 4, s1);
				}
			} else {
				// General symmetric 3-tap: k0*middle + k1*(above + below).
				__m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
				for ( ; i <= width - 8; i += 8 ) {
					__m128 s0, s1, x0, x1;
					s0 = _mm_load_ps(S1 + i);
					s1 = _mm_load_ps(S1 + i + 4);
					s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4);
					s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4);
					x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
					x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
					s0 = _mm_add_ps(s0, _mm_mul_ps(x0, k1));
					s1 = _mm_add_ps(s1, _mm_mul_ps(x1, k1));
					_mm_storeu_ps(dst + i, s0);
					_mm_storeu_ps(dst + i + 4, s1);
				}
			}
		} else {
			if ( fabs(ky[1]) == 1 && ky[1] == -ky[-1] ) {
				// [-1 0 1] (or [1 0 -1]): pure row difference.
				if ( ky[1] < 0 ) {
					std::swap(S0, S2); // flip sign by swapping the rows
				}
				for ( ; i <= width - 8; i += 8 ) {
					__m128 s0, s1, s2, s3;
					s0 = _mm_load_ps(S2 + i);
					s1 = _mm_load_ps(S2 + i + 4);
					s2 = _mm_load_ps(S0 + i);
					s3 = _mm_load_ps(S0 + i + 4);
					s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4);
					s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4);
					_mm_storeu_ps(dst + i, s0);
					_mm_storeu_ps(dst + i + 4, s1);
				}
			} else {
				// General asymmetric 3-tap: center tap does not contribute.
				__m128 k1 = _mm_set1_ps(ky[1]);
				for ( ; i <= width - 8; i += 8 ) {
					__m128 s0 = d4, s1 = d4, x0, x1;
					x0 = _mm_sub_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
					x1 = _mm_sub_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
					s0 = _mm_add_ps(s0, _mm_mul_ps(x0, k1));
					s1 = _mm_add_ps(s1, _mm_mul_ps(x1, k1));
					_mm_storeu_ps(dst + i, s0);
					_mm_storeu_ps(dst + i + 4, s1);
				}
			}
		}

		return i;
	}

	int symmetryType;
	float delta;
	Mat kernel;
};


/////////////////////////////// non-separable filters ///////////////////////////////

///////////////////////////////// 8u<->8u, 8u<->16s /////////////////////////////////

// Non-separable 2D filter, 8-bit input -> 8-bit output. preprocess2DKernel
// flattens the kernel into its nonzero coefficients (coeffs) and their
// offsets; the caller arranges src[k] to point at the input shifted by the
// k-th offset, so each output pixel is sum_k coeffs[k]*src[k][i] + delta.
struct FilterVec_8u {
	FilterVec_8u() {}
	FilterVec_8u(const Mat& _kernel, int _bits, double _delta) {
		Mat kernel;
		// Undo the 2^_bits fixed-point scale of the integer kernel.
		_kernel.convertTo(kernel, CV_32F, 1. / (1 << _bits), 0);
		delta = (float)(_delta / (1 << _bits));
		vector<Point> coords;
		preprocess2DKernel(kernel, coords, coeffs);
		_nz = (int)coords.size();
	}

	// Returns the number of output pixels produced (multiple of 4); the caller
	// finishes the remainder with scalar code.
	int operator()(const uchar** src, uchar* dst, int width) const {
		if ( !checkHardwareSupport(CV_CPU_SSE2) ) {
			return 0;
		}

		const float* kf = (const float*)&coeffs[0];
		int i = 0, k, nz = _nz;
		__m128 d4 = _mm_set1_ps(delta);

		// 16 pixels per iteration: widen u8 -> u16 -> s32 -> float, multiply,
		// accumulate into four float lanes of 4 pixels each.
		for ( ; i <= width - 16; i += 16 ) {
			__m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
			__m128i x0, x1, z = _mm_setzero_si128();

			for ( k = 0; k < nz; k++ ) {
				__m128 f = _mm_load_ss(kf + k), t0, t1;
				f = _mm_shuffle_ps(f, f, 0);

				x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
				x1 = _mm_unpackhi_epi8(x0, z);
				x0 = _mm_unpacklo_epi8(x0, z);

				t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
				t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
				s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
				s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

				t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
				t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
				s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
				s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
			}

			// float -> int32 (rounded) -> int16 -> uint8, saturating.
			x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
			x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
			x0 = _mm_packus_epi16(x0, x1);
			_mm_storeu_si128((__m128i*)(dst + i), x0);
		}

		// Tail: 4 pixels at a time (loads 4 bytes per row via an int read).
		for ( ; i <= width - 4; i += 4 ) {
			__m128 s0 = d4;
			__m128i x0, z = _mm_setzero_si128();

			for ( k = 0; k < nz; k++ ) {
				__m128 f = _mm_load_ss(kf + k), t0;
				f = _mm_shuffle_ps(f, f, 0);

				x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
				x0 = _mm_unpacklo_epi8(x0, z);
				t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
				s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
			}

			x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
			x0 = _mm_packus_epi16(x0, x0);
			*(int*)(dst + i) = _mm_cvtsi128_si32(x0);
		}

		return i;
	}

	int _nz;              // number of nonzero kernel coefficients
	vector<uchar> coeffs; // raw bytes of the float coefficient array
	float delta;
};


// Non-separable 2D filter, 8-bit input -> saturated 16-bit signed output.
// Same scheme as FilterVec_8u, but the float sums are packed only down to
// int16 (no uint8 stage).
struct FilterVec_8u16s {
	FilterVec_8u16s() {}
	FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta) {
		Mat kernel;
		// Undo the 2^_bits fixed-point scale of the integer kernel.
		_kernel.convertTo(kernel, CV_32F, 1. / (1 << _bits), 0);
		delta = (float)(_delta / (1 << _bits));
		vector<Point> coords;
		preprocess2DKernel(kernel, coords, coeffs);
		_nz = (int)coords.size();
	}

	// Returns the number of output pixels produced (multiple of 4); the caller
	// finishes the remainder with scalar code.
	int operator()(const uchar** src, uchar* _dst, int width) const {
		if ( !checkHardwareSupport(CV_CPU_SSE2) ) {
			return 0;
		}

		const float* kf = (const float*)&coeffs[0];
		short* dst = (short*)_dst;
		int i = 0, k, nz = _nz;
		__m128 d4 = _mm_set1_ps(delta);

		// 16 pixels per iteration: widen u8 -> u16 -> s32 -> float, multiply,
		// accumulate into four float lanes of 4 pixels each.
		for ( ; i <= width - 16; i += 16 ) {
			__m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
			__m128i x0, x1, z = _mm_setzero_si128();

			for ( k = 0; k < nz; k++ ) {
				__m128 f = _mm_load_ss(kf + k), t0, t1;
				f = _mm_shuffle_ps(f, f, 0);

				x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
				x1 = _mm_unpackhi_epi8(x0, z);
				x0 = _mm_unpacklo_epi8(x0, z);

				t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
				t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
				s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
				s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

				t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
				t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
				s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
				s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
			}

			// float -> int32 (rounded) -> int16, saturating; 8 shorts/store.
			x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
			x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
			_mm_storeu_si128((__m128i*)(dst + i), x0);
			_mm_storeu_si128((__m128i*)(dst + i + 8), x1);
		}

		// Tail: 4 pixels at a time; storel writes just the low 4 shorts.
		for ( ; i <= width - 4; i += 4 ) {
			__m128 s0 = d4;
			__m128i x0, z = _mm_setzero_si128();

			for ( k = 0; k < nz; k++ ) {
				__m128 f = _mm_load_ss(kf + k), t0;
				f = _mm_shuffle_ps(f, f, 0);

				x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
				x0 = _mm_unpacklo_epi8(x0, z);
				t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
				s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
			}

			x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
			_mm_storel_epi64((__m128i*)(dst + i), x0);
		}

		return i;
	}

	int _nz;              // number of nonzero kernel coefficients
	vector<uchar> coeffs; // raw bytes of the float coefficient array
	float delta;
};


// Non-separable 2D filter for float data: each output pixel accumulates the
// nonzero kernel coefficients times the correspondingly shifted input rows
// (src[k] is pre-offset by the caller), plus delta.
struct FilterVec_32f {
	FilterVec_32f() {}
	FilterVec_32f(const Mat& _kernel, int, double _delta) {
		delta = (float)_delta;
		vector<Point> coords;
		preprocess2DKernel(_kernel, coords, coeffs);
		_nz = (int)coords.size();
	}

	// Returns the number of output pixels produced (multiple of 4); the
	// caller completes the remainder with scalar code.
	int operator()(const uchar** _src, uchar* _dst, int width) const {
		if ( !checkHardwareSupport(CV_CPU_SSE) ) {
			return 0;
		}

		const float* kf = (const float*)&coeffs[0];
		const float** src = (const float**)_src;
		float* dst = (float*)_dst;
		const int nz = _nz;
		__m128 d4 = _mm_set1_ps(delta);

		int i = 0;
		// Main loop: 16 pixels per iteration, four independent accumulators.
		for ( ; i <= width - 16; i += 16 ) {
			__m128 a0 = d4, a1 = d4, a2 = d4, a3 = d4;

			for ( int k = 0; k < nz; k++ ) {
				__m128 cf = _mm_load_ss(kf + k);
				cf = _mm_shuffle_ps(cf, cf, 0);
				const float* sp = src[k] + i;

				__m128 v0 = _mm_loadu_ps(sp);
				__m128 v1 = _mm_loadu_ps(sp + 4);
				a0 = _mm_add_ps(a0, _mm_mul_ps(v0, cf));
				a1 = _mm_add_ps(a1, _mm_mul_ps(v1, cf));

				v0 = _mm_loadu_ps(sp + 8);
				v1 = _mm_loadu_ps(sp + 12);
				a2 = _mm_add_ps(a2, _mm_mul_ps(v0, cf));
				a3 = _mm_add_ps(a3, _mm_mul_ps(v1, cf));
			}

			_mm_storeu_ps(dst + i, a0);
			_mm_storeu_ps(dst + i + 4, a1);
			_mm_storeu_ps(dst + i + 8, a2);
			_mm_storeu_ps(dst + i + 12, a3);
		}

		// Tail loop: 4 pixels at a time.
		for ( ; i <= width - 4; i += 4 ) {
			__m128 acc = d4;

			for ( int k = 0; k < nz; k++ ) {
				__m128 cf = _mm_load_ss(kf + k);
				cf = _mm_shuffle_ps(cf, cf, 0);
				acc = _mm_add_ps(acc, _mm_mul_ps(_mm_loadu_ps(src[k] + i), cf));
			}
			_mm_storeu_ps(dst + i, acc);
		}

		return i;
	}

	int _nz;              // number of nonzero kernel coefficients
	vector<uchar> coeffs; // raw bytes of the float coefficient array
	float delta;
};


#else

// SSE support compiled out: every vectorized filter hook degrades to the
// corresponding "NoVec" stub, whose operator() returns 0 so the generic
// scalar loops process the entire row/column instead.
typedef RowNoVec RowVec_8u32s;
typedef RowNoVec RowVec_32f;
typedef SymmRowSmallNoVec SymmRowSmallVec_8u32s;
typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
typedef ColumnNoVec SymmColumnVec_32s8u;
typedef ColumnNoVec SymmColumnVec_32f;
typedef SymmColumnSmallNoVec SymmColumnSmallVec_32s16s;
typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
typedef FilterNoVec FilterVec_8u;
typedef FilterNoVec FilterVec_8u16s;
typedef FilterNoVec FilterVec_32f;

#endif


// Generic horizontal filter. ST = source sample type, DT = kernel/output
// type, VecOp = vectorized implementation tried first for a prefix of the
// row (its return value tells how many outputs it already produced).
template<typename ST, typename DT, class VecOp> struct RowFilter : public BaseRowFilter {
	RowFilter( const Mat& _kernel, int _anchor, const VecOp& _vecOp = VecOp() ) {
		// Keep a continuous copy of the kernel so it can be indexed linearly.
		if ( _kernel.isContinuous() ) {
			kernel = _kernel;
		} else {
			_kernel.copyTo(kernel);
		}
		anchor = _anchor;
		ksize = kernel.rows + kernel.cols - 1;
		CV_Assert( kernel.type() == DataType<DT>::type &&
				   (kernel.rows == 1 || kernel.cols == 1));
		vecOp = _vecOp;
	}

	// Correlates one row: dst[i] = sum_k kernel[k] * src[i + k*cn].
	void operator()(const uchar* src, uchar* dst, int width, int cn) {
		const DT* taps = (const DT*)kernel.data;
		const int n = ksize;
		DT* out = (DT*)dst;

		// Let the vectorized path consume a prefix of the row first.
		int i = vecOp(src, dst, width, cn);
		width *= cn;

		// Scalar remainder, unrolled four outputs at a time.
		for ( ; i <= width - 4; i += 4 ) {
			const ST* sp = (const ST*)src + i;
			DT c = taps[0];
			DT a0 = c * sp[0], a1 = c * sp[1], a2 = c * sp[2], a3 = c * sp[3];

			for ( int k = 1; k < n; k++ ) {
				sp += cn;
				c = taps[k];
				a0 += c * sp[0];
				a1 += c * sp[1];
				a2 += c * sp[2];
				a3 += c * sp[3];
			}

			out[i] = a0;
			out[i+1] = a1;
			out[i+2] = a2;
			out[i+3] = a3;
		}

		// Last few outputs, one at a time.
		for ( ; i < width; i++ ) {
			const ST* sp = (const ST*)src + i;
			DT acc = taps[0] * sp[0];
			for ( int k = 1; k < n; k++ ) {
				sp += cn;
				acc += taps[k] * sp[0];
			}
			out[i] = acc;
		}
	}

	Mat kernel;
	VecOp vecOp;
};


// Row filter specialized for small (ksize <= 5) symmetric or antisymmetric
// 1D kernels. Symmetry lets mirrored taps share one multiply, and common
// 3- and 5-tap kernels (1-2-1 smoothing, 1,-2,1, central differences) get
// hand-unrolled fast paths.
template<typename ST, typename DT, class VecOp> struct SymmRowSmallFilter :
	public RowFilter<ST, DT, VecOp> {
	SymmRowSmallFilter( const Mat& _kernel, int _anchor, int _symmetryType,
						const VecOp& _vecOp = VecOp())
		: RowFilter<ST, DT, VecOp>( _kernel, _anchor, _vecOp ) {
		symmetryType = _symmetryType;
		CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 && this->ksize <= 5 );
	}

	void operator()(const uchar* src, uchar* dst, int width, int cn) {
		// kx points at the CENTRAL tap: kx[k] is the coefficient shared by
		// the taps k samples on either side of the center.
		int ksize2 = this->ksize / 2, ksize2n = ksize2 * cn;
		const DT* kx = (const DT*)this->kernel.data + ksize2;
		bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
		DT* D = (DT*)dst;
		// The vector op handles the first i outputs; S tracks the source
		// sample under the kernel center for output i.
		int i = this->vecOp(src, dst, width, cn), j, k;
		const ST* S = (const ST*)src + i + ksize2n;
		width *= cn;

		if ( symmetrical ) {
			if ( this->ksize == 1 && kx[0] == 1 ) {
				// Identity kernel: plain copy.
				// NOTE(review): this branch indexes S[i] (not S[0]) and only
				// advances S after the loop, which lines up with the tail
				// loop below only when vecOp returned 0 (the NoVec case) —
				// confirm the vector path never selects a 1-tap identity.
				for ( ; i <= width - 2; i += 2 ) {
					DT s0 = S[i], s1 = S[i+1];
					D[i] = s0; D[i+1] = s1;
				}
				S += i;
			} else if ( this->ksize == 3 ) {
				// 3-tap fast paths; two outputs per iteration, S advances
				// in lock-step with i.
				if ( kx[0] == 2 && kx[1] == 1 )
					// kernel (1, 2, 1)
					for ( ; i <= width - 2; i += 2, S += 2 ) {
						DT s0 = S[-cn] + S[0] * 2 + S[cn], s1 = S[1-cn] + S[1] * 2 + S[1+cn];
						D[i] = s0; D[i+1] = s1;
					}
				else if ( kx[0] == -2 && kx[1] == 1 )
					// kernel (1, -2, 1)
					for ( ; i <= width - 2; i += 2, S += 2 ) {
						DT s0 = S[-cn] - S[0] * 2 + S[cn], s1 = S[1-cn] - S[1] * 2 + S[1+cn];
						D[i] = s0; D[i+1] = s1;
					}
				else {
					// generic symmetric 3-tap
					DT k0 = kx[0], k1 = kx[1];
					for ( ; i <= width - 2; i += 2, S += 2 ) {
						DT s0 = S[0] * k0 + (S[-cn] + S[cn]) * k1, s1 = S[1] * k0 + (S[1-cn] + S[1+cn]) * k1;
						D[i] = s0; D[i+1] = s1;
					}
				}
			} else if ( this->ksize == 5 ) {
				DT k0 = kx[0], k1 = kx[1], k2 = kx[2];
				if ( k0 == -2 && k1 == 0 && k2 == 1 )
					// kernel (1, 0, -2, 0, 1)
					for ( ; i <= width - 2; i += 2, S += 2 ) {
						DT s0 = -2 * S[0] + S[-cn*2] + S[cn*2];
						DT s1 = -2 * S[1] + S[1-cn*2] + S[1+cn*2];
						D[i] = s0; D[i+1] = s1;
					}
				else
					// generic symmetric 5-tap
					for ( ; i <= width - 2; i += 2, S += 2 ) {
						DT s0 = S[0] * k0 + (S[-cn] + S[cn]) * k1 + (S[-cn*2] + S[cn*2]) * k2;
						DT s1 = S[1] * k0 + (S[1-cn] + S[1+cn]) * k1 + (S[1-cn*2] + S[1+cn*2]) * k2;
						D[i] = s0; D[i+1] = s1;
					}
			}

			// Scalar tail: mirrored taps added, then scaled once.
			for ( ; i < width; i++, S++ ) {
				DT s0 = kx[0] * S[0];
				for ( k = 1, j = cn; k <= ksize2; k++, j += cn ) {
					s0 += kx[k] * (S[j] + S[-j]);
				}
				D[i] = s0;
			}
		} else {
			// Antisymmetric kernels: mirrored taps are subtracted; the
			// fast paths below also assume a zero center tap.
			if ( this->ksize == 3 ) {
				if ( kx[0] == 0 && kx[1] == 1 )
					// kernel (-1, 0, 1): central difference
					for ( ; i <= width - 2; i += 2, S += 2 ) {
						DT s0 = S[cn] - S[-cn], s1 = S[1+cn] - S[1-cn];
						D[i] = s0; D[i+1] = s1;
					}
				else {
					DT k1 = kx[1];
					for ( ; i <= width - 2; i += 2, S += 2 ) {
						DT s0 = (S[cn] - S[-cn]) * k1, s1 = (S[1+cn] - S[1-cn]) * k1;
						D[i] = s0; D[i+1] = s1;
					}
				}
			} else if ( this->ksize == 5 ) {
				DT k1 = kx[1], k2 = kx[2];
				for ( ; i <= width - 2; i += 2, S += 2 ) {
					DT s0 = (S[cn] - S[-cn]) * k1 + (S[cn*2] - S[-cn*2]) * k2;
					DT s1 = (S[1+cn] - S[1-cn]) * k1 + (S[1+cn*2] - S[1-cn*2]) * k2;
					D[i] = s0; D[i+1] = s1;
				}
			}

			// Scalar tail (antisymmetric). Note kx[0] IS applied here even
			// though the unrolled paths above ignore it.
			for ( ; i < width; i++, S++ ) {
				DT s0 = kx[0] * S[0];
				for ( k = 1, j = cn; k <= ksize2; k++, j += cn ) {
					s0 += kx[k] * (S[j] - S[-j]);
				}
				D[i] = s0;
			}
		}
	}

	int symmetryType;
};


// Generic column stage of a separable filter: for each destination row it
// correlates a window of buffered source rows (src[0..ksize-1], element
// type ST) with a 1D kernel, adds the constant delta, and casts each sum
// to the destination type via CastOp. VecOp may handle a row prefix.
template<class CastOp, class VecOp> struct ColumnFilter : public BaseColumnFilter {
	typedef typename CastOp::type1 ST;
	typedef typename CastOp::rtype DT;

	ColumnFilter( const Mat& _kernel, int _anchor,
				  double _delta, const CastOp& _castOp = CastOp(),
				  const VecOp& _vecOp = VecOp() ) {
		anchor = _anchor;
		delta = saturate_cast<ST>(_delta);
		castOp0 = _castOp;
		vecOp = _vecOp;
		// Keep a continuous kernel copy for direct pointer access.
		if ( !_kernel.isContinuous() ) {
			_kernel.copyTo(kernel);
		} else {
			kernel = _kernel;
		}
		ksize = kernel.rows + kernel.cols - 1;
		CV_Assert( kernel.type() == DataType<ST>::type &&
				   (kernel.rows == 1 || kernel.cols == 1));
	}

	void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) {
		const ST* kcoef = (const ST*)kernel.data;
		const ST bias = delta;
		const int taps = ksize;
		CastOp cast = castOp0;
		int i, t;

		// One destination row per iteration; the row-pointer window slides
		// by one buffered row each time.
		while ( count-- > 0 ) {
			DT* dp = (DT*)dst;
			i = vecOp(src, dst, width);

			// Four columns per pass.
			for ( ; i <= width - 4; i += 4 ) {
				const ST* sp = (const ST*)src[0] + i;
				ST c = kcoef[0];
				ST a0 = c * sp[0] + bias, a1 = c * sp[1] + bias,
				   a2 = c * sp[2] + bias, a3 = c * sp[3] + bias;

				for ( t = 1; t < taps; t++ ) {
					sp = (const ST*)src[t] + i;
					c = kcoef[t];
					a0 += c * sp[0]; a1 += c * sp[1];
					a2 += c * sp[2]; a3 += c * sp[3];
				}

				dp[i] = cast(a0); dp[i+1] = cast(a1);
				dp[i+2] = cast(a2); dp[i+3] = cast(a3);
			}

			// Remaining columns.
			for ( ; i < width; i++ ) {
				ST a = kcoef[0] * ((const ST*)src[0])[i] + bias;
				for ( t = 1; t < taps; t++ ) {
					a += kcoef[t] * ((const ST*)src[t])[i];
				}
				dp[i] = cast(a);
			}

			dst += dststep;
			src++;
		}
	}

	Mat kernel;
	CastOp castOp0;
	VecOp vecOp;
	ST delta;
};


// Column filter for symmetric/antisymmetric kernels of arbitrary length.
// Mirrored source rows share a coefficient: each output is
//   ky[0]*row0 + sum_k ky[k]*(row[+k] + row[-k])   (symmetric), or
//   sum_k ky[k]*(row[+k] - row[-k])                (antisymmetric),
// plus delta, cast to the destination type.
template<class CastOp, class VecOp> struct SymmColumnFilter : public ColumnFilter<CastOp, VecOp> {
	typedef typename CastOp::type1 ST;
	typedef typename CastOp::rtype DT;

	SymmColumnFilter( const Mat& _kernel, int _anchor,
					  double _delta, int _symmetryType,
					  const CastOp& _castOp = CastOp(),
					  const VecOp& _vecOp = VecOp())
		: ColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _castOp, _vecOp ) {
		symmetryType = _symmetryType;
		CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
	}

	void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) {
		int ksize2 = this->ksize / 2;
		// ky points at the central tap; src is advanced so src[0] is the
		// center row and src[+/-k] are the mirrored rows around it.
		const ST* ky = (const ST*)this->kernel.data + ksize2;
		int i, k;
		bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
		ST _delta = this->delta;
		CastOp castOp = this->castOp0;
		src += ksize2;

		if ( symmetrical ) {
			for ( ; count--; dst += dststep, src++ ) {
				DT* D = (DT*)dst;
				i = (this->vecOp)(src, dst, width);

				// Four columns per pass; mirrored rows summed before scaling.
				for ( ; i <= width - 4; i += 4 ) {
					ST f = ky[0];
					const ST* S = (const ST*)src[0] + i, *S2;
					ST s0 = f * S[0] + _delta, s1 = f * S[1] + _delta,
					   s2 = f * S[2] + _delta, s3 = f * S[3] + _delta;

					for ( k = 1; k <= ksize2; k++ ) {
						S = (const ST*)src[k] + i;
						S2 = (const ST*)src[-k] + i;
						f = ky[k];
						s0 += f * (S[0] + S2[0]);
						s1 += f * (S[1] + S2[1]);
						s2 += f * (S[2] + S2[2]);
						s3 += f * (S[3] + S2[3]);
					}

					D[i] = castOp(s0); D[i+1] = castOp(s1);
					D[i+2] = castOp(s2); D[i+3] = castOp(s3);
				}

				for ( ; i < width; i++ ) {
					ST s0 = ky[0] * ((const ST*)src[0])[i] + _delta;
					for ( k = 1; k <= ksize2; k++ ) {
						s0 += ky[k] * (((const ST*)src[k])[i] + ((const ST*)src[-k])[i]);
					}
					D[i] = castOp(s0);
				}
			}
		} else {
			// Antisymmetric: the center tap contributes nothing (it is
			// skipped entirely; accumulators start at delta, k starts at 1).
			for ( ; count--; dst += dststep, src++ ) {
				DT* D = (DT*)dst;
				i = this->vecOp(src, dst, width);

				for ( ; i <= width - 4; i += 4 ) {
					ST f = ky[0];
					const ST* S, *S2;
					ST s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;

					for ( k = 1; k <= ksize2; k++ ) {
						S = (const ST*)src[k] + i;
						S2 = (const ST*)src[-k] + i;
						f = ky[k];
						s0 += f * (S[0] - S2[0]);
						s1 += f * (S[1] - S2[1]);
						s2 += f * (S[2] - S2[2]);
						s3 += f * (S[3] - S2[3]);
					}

					D[i] = castOp(s0); D[i+1] = castOp(s1);
					D[i+2] = castOp(s2); D[i+3] = castOp(s3);
				}

				for ( ; i < width; i++ ) {
					ST s0 = _delta;
					for ( k = 1; k <= ksize2; k++ ) {
						s0 += ky[k] * (((const ST*)src[k])[i] - ((const ST*)src[-k])[i]);
					}
					D[i] = castOp(s0);
				}
			}
		}
	}

	int symmetryType;
};


// Column filter specialized for 3-tap symmetric/antisymmetric kernels,
// with unrolled fast paths for the common (1,2,1), (1,-2,1) and (-1,0,1)
// kernels. S0/S1/S2 below are the rows above/at/below the output row.
template<class CastOp, class VecOp>
struct SymmColumnSmallFilter : public SymmColumnFilter<CastOp, VecOp> {
	typedef typename CastOp::type1 ST;
	typedef typename CastOp::rtype DT;

	SymmColumnSmallFilter( const Mat& _kernel, int _anchor,
						   double _delta, int _symmetryType,
						   const CastOp& _castOp = CastOp(),
						   const VecOp& _vecOp = VecOp())
		: SymmColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _symmetryType, _castOp, _vecOp ) {
		CV_Assert( this->ksize == 3 );
	}

	void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) {
		int ksize2 = this->ksize / 2;
		// ky points at the CENTRAL tap: ky[0] is the center coefficient and
		// ky[1] the shared outer coefficient, matching the generic fallback
		// below (S1*f0 + (S0 + S2)*f1) and the row-filter analogue.
		const ST* ky = (const ST*)this->kernel.data + ksize2;
		int i;
		bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
		// BUGFIX: the center/outer taps were tested in swapped order
		// (ky[0]==1 && ky[1]==2 and ky[0]==1 && ky[1]==-2), which selected
		// the unrolled 1-2-1 / 1,-2,1 paths for the wrong kernels while
		// those paths compute center*±2 + outer*1. The center tap is ky[0],
		// exactly as SymmRowSmallFilter checks (kx[0] == 2 && kx[1] == 1).
		bool is_1_2_1 = ky[0] == 2 && ky[1] == 1;
		bool is_1_m2_1 = ky[0] == -2 && ky[1] == 1;
		// The ±1 antisymmetric fast path additionally requires a zero center
		// tap (the generic antisymmetric path ignores ky[0] as well).
		bool is_m1_0_1 = ky[0] == 0 && (ky[1] == 1 || ky[1] == -1);
		ST f0 = ky[0], f1 = ky[1];
		ST _delta = this->delta;
		CastOp castOp = this->castOp0;
		src += ksize2;

		for ( ; count--; dst += dststep, src++ ) {
			DT* D = (DT*)dst;
			i = (this->vecOp)(src, dst, width);
			const ST* S0 = (const ST*)src[-1];
			const ST* S1 = (const ST*)src[0];
			const ST* S2 = (const ST*)src[1];

			if ( symmetrical ) {
				if ( is_1_2_1 ) {
					// kernel (1, 2, 1): smoothing
					for ( ; i <= width - 4; i += 4 ) {
						ST s0 = S0[i] + S1[i] * 2 + S2[i] + _delta;
						ST s1 = S0[i+1] + S1[i+1] * 2 + S2[i+1] + _delta;
						D[i] = castOp(s0);
						D[i+1] = castOp(s1);

						s0 = S0[i+2] + S1[i+2] * 2 + S2[i+2] + _delta;
						s1 = S0[i+3] + S1[i+3] * 2 + S2[i+3] + _delta;
						D[i+2] = castOp(s0);
						D[i+3] = castOp(s1);
					}
				} else if ( is_1_m2_1 ) {
					// kernel (1, -2, 1): second derivative
					for ( ; i <= width - 4; i += 4 ) {
						ST s0 = S0[i] - S1[i] * 2 + S2[i] + _delta;
						ST s1 = S0[i+1] - S1[i+1] * 2 + S2[i+1] + _delta;
						D[i] = castOp(s0);
						D[i+1] = castOp(s1);

						s0 = S0[i+2] - S1[i+2] * 2 + S2[i+2] + _delta;
						s1 = S0[i+3] - S1[i+3] * 2 + S2[i+3] + _delta;
						D[i+2] = castOp(s0);
						D[i+3] = castOp(s1);
					}
				} else {
					// generic symmetric 3-tap
					for ( ; i <= width - 4; i += 4 ) {
						ST s0 = (S0[i] + S2[i]) * f1 + S1[i] * f0 + _delta;
						ST s1 = (S0[i+1] + S2[i+1]) * f1 + S1[i+1] * f0 + _delta;
						D[i] = castOp(s0);
						D[i+1] = castOp(s1);

						s0 = (S0[i+2] + S2[i+2]) * f1 + S1[i+2] * f0 + _delta;
						s1 = (S0[i+3] + S2[i+3]) * f1 + S1[i+3] * f0 + _delta;
						D[i+2] = castOp(s0);
						D[i+3] = castOp(s1);
					}
				}

				for ( ; i < width; i++ ) {
					D[i] = castOp((S0[i] + S2[i]) * f1 + S1[i] * f0 + _delta);
				}
			} else {
				if ( is_m1_0_1 ) {
					// kernel (±1, 0, ∓1): a swap folds the sign into the
					// operand order so the loop body is a plain difference.
					if ( f1 < 0 ) {
						std::swap(S0, S2);
					}

					for ( ; i <= width - 4; i += 4 ) {
						ST s0 = S2[i] - S0[i] + _delta;
						ST s1 = S2[i+1] - S0[i+1] + _delta;
						D[i] = castOp(s0);
						D[i+1] = castOp(s1);

						s0 = S2[i+2] - S0[i+2] + _delta;
						s1 = S2[i+3] - S0[i+3] + _delta;
						D[i+2] = castOp(s0);
						D[i+3] = castOp(s1);
					}

					// Restore row order for the shared scalar tail below.
					if ( f1 < 0 ) {
						std::swap(S0, S2);
					}
				} else {
					// generic antisymmetric 3-tap (center tap ignored)
					for ( ; i <= width - 4; i += 4 ) {
						ST s0 = (S2[i] - S0[i]) * f1 + _delta;
						ST s1 = (S2[i+1] - S0[i+1]) * f1 + _delta;
						D[i] = castOp(s0);
						D[i+1] = castOp(s1);

						s0 = (S2[i+2] - S0[i+2]) * f1 + _delta;
						s1 = (S2[i+3] - S0[i+3]) * f1 + _delta;
						D[i+2] = castOp(s0);
						D[i+3] = castOp(s1);
					}
				}

				for ( ; i < width; i++ ) {
					D[i] = castOp((S2[i] - S0[i]) * f1 + _delta);
				}
			}
		}
	}
};

// Plain saturating conversion functor: maps an accumulator value of type ST
// to the destination type DT via saturate_cast. Used by the floating-point
// filter instantiations.
template<typename ST, typename DT> struct Cast {
	typedef ST type1;
	typedef DT rtype;

	DT operator()(ST val) const { return saturate_cast<DT>(val); }
};

// Fixed-point conversion functor with a compile-time shift: rounds the
// scaled integer accumulator to nearest (adds half an LSB, DELTA) before
// shifting down by 'bits' and saturating to DT.
template<typename ST, typename DT, int bits> struct FixedPtCast {
	typedef ST type1;
	typedef DT rtype;
	enum { SHIFT = bits, DELTA = 1 << (bits - 1) };

	DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA) >> SHIFT); }
};

// Fixed-point conversion functor with a run-time shift. bits == 0 degrades
// to a plain saturating cast (DELTA is forced to 0 so the >> 0 is a no-op).
template<typename ST, typename DT> struct FixedPtCastEx {
	typedef ST type1;
	typedef DT rtype;

	FixedPtCastEx() : SHIFT(0), DELTA(0) {}
	FixedPtCastEx(int bits) : SHIFT(bits), DELTA(bits ? 1 << (bits - 1) : 0) {}
	DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA) >> SHIFT); }
	int SHIFT, DELTA;
};

// Returns the row stage of a separable linear filter for the given
// source/buffer type pair. Small (<= 5 tap) symmetric or antisymmetric
// kernels are dispatched FIRST to the specialized SymmRowSmallFilter, so
// the order of the tests below matters; everything else falls through to
// a per-type instantiation of the generic RowFilter.
Ptr<BaseRowFilter> getLinearRowFilter( int srcType, int bufType,
									   const Mat& kernel, int anchor,
									   int symmetryType ) {
	int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(bufType);
	int cn = CV_MAT_CN(srcType);
	// The kernel must be single-channel with the same depth as the buffer.
	CV_Assert( cn == CV_MAT_CN(bufType) &&
			   ddepth >= std::max(sdepth, CV_32S) &&
			   kernel.type() == ddepth );
	int ksize = kernel.rows + kernel.cols - 1;

	if ( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 && ksize <= 5 ) {
		if ( sdepth == CV_8U && ddepth == CV_32S )
			return Ptr<BaseRowFilter>(new SymmRowSmallFilter<uchar, int, SymmRowSmallVec_8u32s>
									  (kernel, anchor, symmetryType, SymmRowSmallVec_8u32s(kernel, symmetryType)));
		if ( sdepth == CV_32F && ddepth == CV_32F )
			return Ptr<BaseRowFilter>(new SymmRowSmallFilter<float, float, SymmRowSmallVec_32f>
									  (kernel, anchor, symmetryType, SymmRowSmallVec_32f(kernel, symmetryType)));
	}

	// Generic row filters; only the 8u->32s and 32f->32f pairs have
	// vectorized helpers, the rest use the scalar RowNoVec.
	if ( sdepth == CV_8U && ddepth == CV_32S )
		return Ptr<BaseRowFilter>(new RowFilter<uchar, int, RowVec_8u32s>
								  (kernel, anchor, RowVec_8u32s(kernel)));
	if ( sdepth == CV_8U && ddepth == CV_32F ) {
		return Ptr<BaseRowFilter>(new RowFilter<uchar, float, RowNoVec>(kernel, anchor));
	}
	if ( sdepth == CV_8U && ddepth == CV_64F ) {
		return Ptr<BaseRowFilter>(new RowFilter<uchar, double, RowNoVec>(kernel, anchor));
	}
	if ( sdepth == CV_16U && ddepth == CV_32F ) {
		return Ptr<BaseRowFilter>(new RowFilter<ushort, float, RowNoVec>(kernel, anchor));
	}
	if ( sdepth == CV_16U && ddepth == CV_64F ) {
		return Ptr<BaseRowFilter>(new RowFilter<ushort, double, RowNoVec>(kernel, anchor));
	}
	if ( sdepth == CV_16S && ddepth == CV_32F ) {
		return Ptr<BaseRowFilter>(new RowFilter<short, float, RowNoVec>(kernel, anchor));
	}
	if ( sdepth == CV_16S && ddepth == CV_64F ) {
		return Ptr<BaseRowFilter>(new RowFilter<short, double, RowNoVec>(kernel, anchor));
	}
	if ( sdepth == CV_32F && ddepth == CV_32F )
		return Ptr<BaseRowFilter>(new RowFilter<float, float, RowVec_32f>
								  (kernel, anchor, RowVec_32f(kernel)));
	if ( sdepth == CV_64F && ddepth == CV_64F ) {
		return Ptr<BaseRowFilter>(new RowFilter<double, double, RowNoVec>(kernel, anchor));
	}

	CV_Error_( CV_StsNotImplemented,
			   ("Unsupported combination of source format (=%d), and buffer format (=%d)",
				srcType, bufType));

	// Unreachable: CV_Error_ throws/aborts; keeps compilers happy.
	return Ptr<BaseRowFilter>(0);
}


// Returns the column stage of a separable linear filter for the given
// buffer/destination type pair. 'bits' is the fixed-point shift used when
// the buffer holds integer-scaled values (see createSeparableLinearFilter);
// it is folded into the FixedPtCastEx functors. Symmetric/antisymmetric
// kernels use the specialized filters, with 3-tap kernels getting the
// SymmColumnSmallFilter fast paths first — test order matters.
Ptr<BaseColumnFilter> getLinearColumnFilter( int bufType, int dstType,
		const Mat& kernel, int anchor,
		int symmetryType, double delta,
		int bits ) {
	int sdepth = CV_MAT_DEPTH(bufType), ddepth = CV_MAT_DEPTH(dstType);
	int cn = CV_MAT_CN(dstType);
	// Kernel must be single-channel with the buffer's depth.
	CV_Assert( cn == CV_MAT_CN(bufType) &&
			   sdepth >= std::max(ddepth, CV_32S) &&
			   kernel.type() == sdepth );

	if ( !(symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) ) {
		// General (unsymmetric) kernels: plain ColumnFilter per type pair.
		if ( ddepth == CV_8U && sdepth == CV_32S )
			return Ptr<BaseColumnFilter>(new ColumnFilter<FixedPtCastEx<int, uchar>, ColumnNoVec>
										 (kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits)));
		if ( ddepth == CV_8U && sdepth == CV_32F ) {
			return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, uchar>, ColumnNoVec>(kernel, anchor, delta));
		}
		if ( ddepth == CV_8U && sdepth == CV_64F ) {
			return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, uchar>, ColumnNoVec>(kernel, anchor, delta));
		}
		if ( ddepth == CV_16U && sdepth == CV_32F ) {
			return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, ushort>, ColumnNoVec>(kernel, anchor, delta));
		}
		if ( ddepth == CV_16U && sdepth == CV_64F ) {
			return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, ushort>, ColumnNoVec>(kernel, anchor, delta));
		}
		if ( ddepth == CV_16S && sdepth == CV_32F ) {
			return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, short>, ColumnNoVec>(kernel, anchor, delta));
		}
		if ( ddepth == CV_16S && sdepth == CV_64F ) {
			return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, short>, ColumnNoVec>(kernel, anchor, delta));
		}
		if ( ddepth == CV_32F && sdepth == CV_32F ) {
			return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<float, float>, ColumnNoVec>(kernel, anchor, delta));
		}
		if ( ddepth == CV_64F && sdepth == CV_64F ) {
			return Ptr<BaseColumnFilter>(new ColumnFilter<Cast<double, double>, ColumnNoVec>(kernel, anchor, delta));
		}
	} else {
		int ksize = kernel.rows + kernel.cols - 1;
		// 3-tap symmetric/antisymmetric kernels: unrolled small filters.
		if ( ksize == 3 ) {
			if ( ddepth == CV_8U && sdepth == CV_32S )
				return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter <
											 FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u >
											 (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
											  SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)));
			if ( ddepth == CV_16S && sdepth == CV_32S && bits == 0 )
				return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter < Cast<int, short>,
											 SymmColumnSmallVec_32s16s > (kernel, anchor, delta, symmetryType,
													 Cast<int, short>(), SymmColumnSmallVec_32s16s(kernel, symmetryType, bits, delta)));
			if ( ddepth == CV_32F && sdepth == CV_32F )
				return Ptr<BaseColumnFilter>(new SymmColumnSmallFilter <
											 Cast<float, float>, SymmColumnSmallVec_32f >
											 (kernel, anchor, delta, symmetryType, Cast<float, float>(),
											  SymmColumnSmallVec_32f(kernel, symmetryType, 0, delta)));
		}
		// General-length symmetric/antisymmetric kernels.
		if ( ddepth == CV_8U && sdepth == CV_32S )
			return Ptr<BaseColumnFilter>(new SymmColumnFilter<FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u>
										 (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
										  SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)));
		if ( ddepth == CV_8U && sdepth == CV_32F )
			return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, uchar>, ColumnNoVec>
										 (kernel, anchor, delta, symmetryType));
		if ( ddepth == CV_8U && sdepth == CV_64F )
			return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, uchar>, ColumnNoVec>
										 (kernel, anchor, delta, symmetryType));
		if ( ddepth == CV_16U && sdepth == CV_32F )
			return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, ushort>, ColumnNoVec>
										 (kernel, anchor, delta, symmetryType));
		if ( ddepth == CV_16U && sdepth == CV_64F )
			return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, ushort>, ColumnNoVec>
										 (kernel, anchor, delta, symmetryType));
		if ( ddepth == CV_16S && sdepth == CV_32S )
			return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<int, short>, ColumnNoVec>
										 (kernel, anchor, delta, symmetryType));
		if ( ddepth == CV_16S && sdepth == CV_32F )
			return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, short>, ColumnNoVec>
										 (kernel, anchor, delta, symmetryType));
		if ( ddepth == CV_16S && sdepth == CV_64F )
			return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, short>, ColumnNoVec>
										 (kernel, anchor, delta, symmetryType));
		if ( ddepth == CV_32F && sdepth == CV_32F )
			return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<float, float>, SymmColumnVec_32f>
										 (kernel, anchor, delta, symmetryType, Cast<float, float>(),
										  SymmColumnVec_32f(kernel, symmetryType, 0, delta)));
		if ( ddepth == CV_64F && sdepth == CV_64F )
			return Ptr<BaseColumnFilter>(new SymmColumnFilter<Cast<double, double>, ColumnNoVec>
										 (kernel, anchor, delta, symmetryType));
	}

	CV_Error_( CV_StsNotImplemented,
			   ("Unsupported combination of buffer format (=%d), and destination format (=%d)",
				bufType, dstType));

	// Unreachable: CV_Error_ throws/aborts; keeps compilers happy.
	return Ptr<BaseColumnFilter>(0);
}


// Builds a complete FilterEngine for a separable linear filter: classifies
// both 1D kernels, picks an intermediate buffer depth, optionally switches
// to a fixed-point pipeline for 8-bit data, and wires up the row and
// column stages.
Ptr<FilterEngine> createSeparableLinearFilter(
	int _srcType, int _dstType,
	const Mat& _rowKernel, const Mat& _columnKernel,
	Point _anchor, double _delta,
	int _rowBorderType, int _columnBorderType,
	const Scalar& _borderValue ) {
	_srcType = CV_MAT_TYPE(_srcType);
	_dstType = CV_MAT_TYPE(_dstType);
	int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
	int cn = CV_MAT_CN(_srcType);
	CV_Assert( cn == CV_MAT_CN(_dstType) );
	int rsize = _rowKernel.rows + _rowKernel.cols - 1;
	int csize = _columnKernel.rows + _columnKernel.cols - 1;
	// Negative anchor means "center of the kernel".
	if ( _anchor.x < 0 ) {
		_anchor.x = rsize / 2;
	}
	if ( _anchor.y < 0 ) {
		_anchor.y = csize / 2;
	}
	int rtype = getKernelType(_rowKernel,
							  _rowKernel.rows == 1 ? Point(_anchor.x, 0) : Point(0, _anchor.x));
	int ctype = getKernelType(_columnKernel,
							  _columnKernel.rows == 1 ? Point(_anchor.y, 0) : Point(0, _anchor.y));
	Mat rowKernel, columnKernel;

	int bdepth = std::max(CV_32F, std::max(sdepth, ddepth));
	int bits = 0;

	// Fixed-point path for 8-bit sources: each kernel is scaled by 2^bits
	// and converted to CV_32S, so the row and column passes together scale
	// results by 2^(2*bits). The doubled bit count and pre-scaled delta are
	// handed to the column filter, whose cast shifts the sum back down.
	// (bits stays 0 for the 16S/integer-kernel case — no rescaling needed.)
	if ( sdepth == CV_8U &&
			((rtype == KERNEL_SMOOTH + KERNEL_SYMMETRICAL &&
			  ctype == KERNEL_SMOOTH + KERNEL_SYMMETRICAL &&
			  ddepth == CV_8U) ||
			 ((rtype & (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL)) &&
			  (ctype & (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL)) &&
			  (rtype & ctype & KERNEL_INTEGER) &&
			  ddepth == CV_16S)) ) {
		bdepth = CV_32S;
		bits = ddepth == CV_8U ? 8 : 0;
		_rowKernel.convertTo( rowKernel, CV_32S, 1 << bits );
		_columnKernel.convertTo( columnKernel, CV_32S, 1 << bits );
		bits *= 2;
		_delta *= (1 << bits);
	} else {
		// Floating-point path: bring both kernels to the buffer depth.
		if ( _rowKernel.type() != bdepth ) {
			_rowKernel.convertTo( rowKernel, bdepth );
		} else {
			rowKernel = _rowKernel;
		}
		if ( _columnKernel.type() != bdepth ) {
			_columnKernel.convertTo( columnKernel, bdepth );
		} else {
			columnKernel = _columnKernel;
		}
	}

	int _bufType = CV_MAKETYPE(bdepth, cn);
	Ptr<BaseRowFilter> _rowFilter = getLinearRowFilter(
										_srcType, _bufType, rowKernel, _anchor.x, rtype);
	Ptr<BaseColumnFilter> _columnFilter = getLinearColumnFilter(
			_bufType, _dstType, columnKernel, _anchor.y, ctype, _delta, bits );

	return Ptr<FilterEngine>( new FilterEngine(Ptr<BaseFilter>(0), _rowFilter, _columnFilter,
							  _srcType, _dstType, _bufType, _rowBorderType, _columnBorderType, _borderValue ));
}


/****************************************************************************************\
*                               Non-separable linear filter                              *
\****************************************************************************************/

void preprocess2DKernel( const Mat& kernel, vector<Point>& coords, vector<uchar>& coeffs ) {
	int i, j, k, nz = countNonZero(kernel), ktype = kernel.type();
	if (nz == 0) {
		nz = 1;
	}
	CV_Assert( ktype == CV_8U || ktype == CV_32S || ktype == CV_32F || ktype == CV_64F );
	coords.resize(nz);
	coeffs.resize(nz * getElemSize(ktype));
	uchar* _coeffs = &coeffs[0];

	for ( i = k = 0; i < kernel.rows; i++ ) {
		const uchar* krow = kernel.data + kernel.step * i;
		for ( j = 0; j < kernel.cols; j++ ) {
			if ( ktype == CV_8U ) {
				uchar val = krow[j];
				if ( val == 0 ) {
					continue;
				}
				coords[k] = Point(j, i);
				_coeffs[k++] = val;
			} else if ( ktype == CV_32S ) {
				int val = ((const int*)krow)[j];
				if ( val == 0 ) {
					continue;
				}
				coords[k] = Point(j, i);
				((int*)_coeffs)[k++] = val;
			} else if ( ktype == CV_32F ) {
				float val = ((const float*)krow)[j];
				if ( val == 0 ) {
					continue;
				}
				coords[k] = Point(j, i);
				((float*)_coeffs)[k++] = val;
			} else {
				double val = ((const double*)krow)[j];
				if ( val == 0 ) {
					continue;
				}
				coords[k] = Point(j, i);
				((double*)_coeffs)[k++] = val;
			}
		}
	}
}


// Generic non-separable 2D filter. The kernel is stored sparsely as
// parallel arrays (coords[k], coeffs[k]) built by preprocess2DKernel;
// each output pixel is delta + sum over the nonzero taps of
// coefficient * source sample, cast to the destination type.
template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFilter {
	typedef typename CastOp::type1 KT;
	typedef typename CastOp::rtype DT;

	Filter2D( const Mat& _kernel, Point _anchor,
			  double _delta, const CastOp& _castOp = CastOp(),
			  const VecOp& _vecOp = VecOp() ) {
		anchor = _anchor;
		ksize = _kernel.size();
		delta = saturate_cast<KT>(_delta);
		castOp0 = _castOp;
		vecOp = _vecOp;
		CV_Assert( _kernel.type() == DataType<KT>::type );
		preprocess2DKernel( _kernel, coords, coeffs );
		// One scratch pointer per nonzero tap, reused every row.
		ptrs.resize( coords.size() );
	}

	void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn) {
		KT _delta = delta;
		const Point* pt = &coords[0];
		const KT* kf = (const KT*)&coeffs[0];
		// ptrs is reinterpreted as the per-tap source-pointer scratch array.
		const ST** kp = (const ST**)&ptrs[0];
		int i, k, nz = (int)coords.size();
		CastOp castOp = castOp0;

		width *= cn;
		for ( ; count > 0; count--, dst += dststep, src++ ) {
			DT* D = (DT*)dst;

			// Point each tap at its source row/column for this output row.
			for ( k = 0; k < nz; k++ ) {
				kp[k] = (const ST*)src[pt[k].y] + pt[k].x * cn;
			}

			i = vecOp((const uchar**)kp, dst, width);

			// Four outputs per pass.
			for ( ; i <= width - 4; i += 4 ) {
				KT s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;

				for ( k = 0; k < nz; k++ ) {
					const ST* sptr = kp[k] + i;
					KT f = kf[k];
					s0 += f * sptr[0];
					s1 += f * sptr[1];
					s2 += f * sptr[2];
					s3 += f * sptr[3];
				}

				D[i] = castOp(s0); D[i+1] = castOp(s1);
				D[i+2] = castOp(s2); D[i+3] = castOp(s3);
			}

			// Scalar tail.
			for ( ; i < width; i++ ) {
				KT s0 = _delta;
				for ( k = 0; k < nz; k++ ) {
					s0 += kf[k] * kp[k][i];
				}
				D[i] = castOp(s0);
			}
		}
	}

	vector<Point> coords;
	vector<uchar> coeffs;
	vector<uchar*> ptrs;
	KT delta;
	CastOp castOp0;
	VecOp vecOp;
};


// Returns a non-separable 2D filter for the given type pair. If the caller
// supplies a CV_32S kernel for an 8-bit source (fixed-point pipeline, see
// createLinearFilter), the fixed-point instantiations are used directly;
// otherwise the kernel is converted to float/double — undoing the 2^bits
// scaling if it arrived as integers — and a floating-point filter is built.
Ptr<BaseFilter> getLinearFilter(int srcType, int dstType,
								const Mat& _kernel, Point anchor,
								double delta, int bits) {
	int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
	int cn = CV_MAT_CN(srcType), kdepth = _kernel.depth();
	CV_Assert( cn == CV_MAT_CN(dstType) && ddepth >= sdepth );

	anchor = normalizeAnchor(anchor, _kernel.size());

	// Fixed-point paths (integer kernel, 8-bit source).
	if ( sdepth == CV_8U && ddepth == CV_8U && kdepth == CV_32S )
		return Ptr<BaseFilter>(new Filter2D<uchar, FixedPtCastEx<int, uchar>, FilterVec_8u>
							   (_kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits),
								FilterVec_8u(_kernel, bits, delta)));
	if ( sdepth == CV_8U && ddepth == CV_16S && kdepth == CV_32S )
		return Ptr<BaseFilter>(new Filter2D<uchar, FixedPtCastEx<int, short>, FilterVec_8u16s>
							   (_kernel, anchor, delta, FixedPtCastEx<int, short>(bits),
								FilterVec_8u16s(_kernel, bits, delta)));

	// Floating-point paths: pick the accumulator depth and convert the
	// kernel, compensating for any fixed-point pre-scaling.
	kdepth = sdepth == CV_64F || ddepth == CV_64F ? CV_64F : CV_32F;
	Mat kernel;
	if ( _kernel.type() == kdepth ) {
		kernel = _kernel;
	} else {
		_kernel.convertTo(kernel, kdepth, _kernel.type() == CV_32S ? 1. / (1 << bits) : 1.);
	}

	if ( sdepth == CV_8U && ddepth == CV_8U )
		return Ptr<BaseFilter>(new Filter2D<uchar, Cast<float, uchar>, FilterVec_8u>
							   (kernel, anchor, delta, Cast<float, uchar>(), FilterVec_8u(kernel, 0, delta)));
	if ( sdepth == CV_8U && ddepth == CV_16U )
		return Ptr<BaseFilter>(new Filter2D < uchar,
							   Cast<float, ushort>, FilterNoVec > (kernel, anchor, delta));
	if ( sdepth == CV_8U && ddepth == CV_16S )
		return Ptr<BaseFilter>(new Filter2D<uchar, Cast<float, short>, FilterVec_8u16s>
							   (kernel, anchor, delta, Cast<float, short>(), FilterVec_8u16s(kernel, 0, delta)));
	if ( sdepth == CV_8U && ddepth == CV_32F )
		return Ptr<BaseFilter>(new Filter2D < uchar,
							   Cast<float, float>, FilterNoVec > (kernel, anchor, delta));
	if ( sdepth == CV_8U && ddepth == CV_64F )
		return Ptr<BaseFilter>(new Filter2D < uchar,
							   Cast<double, double>, FilterNoVec > (kernel, anchor, delta));

	if ( sdepth == CV_16U && ddepth == CV_16U )
		return Ptr<BaseFilter>(new Filter2D < ushort,
							   Cast<float, ushort>, FilterNoVec > (kernel, anchor, delta));
	if ( sdepth == CV_16U && ddepth == CV_32F )
		return Ptr<BaseFilter>(new Filter2D < ushort,
							   Cast<float, float>, FilterNoVec > (kernel, anchor, delta));
	if ( sdepth == CV_16U && ddepth == CV_64F )
		return Ptr<BaseFilter>(new Filter2D < ushort,
							   Cast<double, double>, FilterNoVec > (kernel, anchor, delta));

	if ( sdepth == CV_16S && ddepth == CV_16S )
		return Ptr<BaseFilter>(new Filter2D < short,
							   Cast<float, short>, FilterNoVec > (kernel, anchor, delta));
	if ( sdepth == CV_16S && ddepth == CV_32F )
		return Ptr<BaseFilter>(new Filter2D < short,
							   Cast<float, float>, FilterNoVec > (kernel, anchor, delta));
	if ( sdepth == CV_16S && ddepth == CV_64F )
		return Ptr<BaseFilter>(new Filter2D < short,
							   Cast<double, double>, FilterNoVec > (kernel, anchor, delta));

	if ( sdepth == CV_32F && ddepth == CV_32F )
		return Ptr<BaseFilter>(new Filter2D<float, Cast<float, float>, FilterVec_32f>
							   (kernel, anchor, delta, Cast<float, float>(), FilterVec_32f(kernel, 0, delta)));
	if ( sdepth == CV_64F && ddepth == CV_64F )
		return Ptr<BaseFilter>(new Filter2D < double,
							   Cast<double, double>, FilterNoVec > (kernel, anchor, delta));

	CV_Error_( CV_StsNotImplemented,
			   ("Unsupported combination of source format (=%d), and destination format (=%d)",
				srcType, dstType));

	// Unreachable: CV_Error_ throws/aborts; keeps compilers happy.
	return Ptr<BaseFilter>(0);
}


// Builds a FilterEngine around a single non-separable 2D filter stage.
// For 8-bit sources with small kernels the computation is switched to a
// fixed-point CV_32S kernel (scaled by 2^bits unless the kernel is already
// integer-valued); getLinearFilter receives 'bits' to undo the scaling.
Ptr<FilterEngine> createLinearFilter( int _srcType, int _dstType, const Mat& _kernel,
									  Point _anchor, double _delta,
									  int _rowBorderType, int _columnBorderType,
									  const Scalar& _borderValue ) {
	_srcType = CV_MAT_TYPE(_srcType);
	_dstType = CV_MAT_TYPE(_dstType);
	int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
	CV_Assert( CV_MAT_CN(_srcType) == CV_MAT_CN(_dstType) );

	// Integer kernels are treated as already-classified fixed-point data.
	int kernelClass = _kernel.depth() == CV_32S ? KERNEL_INTEGER : getKernelType(_kernel, _anchor);
	int bits = 0;
	Mat workKernel = _kernel;

	bool useFixedPoint = sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S) &&
						 _kernel.rows * _kernel.cols <= (1 << 10);
	if ( useFixedPoint ) {
		bits = (kernelClass & KERNEL_INTEGER) ? 0 : 11;
		_kernel.convertTo(workKernel, CV_32S, 1 << bits);
	}

	Ptr<BaseFilter> stage = getLinearFilter(_srcType, _dstType,
							workKernel, _anchor, _delta, bits);

	return Ptr<FilterEngine>(new FilterEngine(stage, Ptr<BaseRowFilter>(0),
							 Ptr<BaseColumnFilter>(0), _srcType, _dstType, _srcType,
							 _rowBorderType, _columnBorderType, _borderValue ));
}


// 2D correlation of src with kernel. Small kernels run through the direct
// FilterEngine path; kernels at or above dft_filter_size taps are routed to
// the DFT-based crossCorr, which is faster for large supports.
void filter2D( const Mat& src, Mat& dst, int ddepth,
			   const Mat& kernel, Point anchor,
			   double delta, int borderType ) {
	if ( ddepth < 0 ) {
		ddepth = src.depth();
	}

#if CV_SSE2
	int dft_filter_size = ((src.depth() == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) ||
						   (src.depth() == CV_32F && ddepth == CV_32F)) && checkHardwareSupport(CV_CPU_SSE3) ? 130 : 50;
#else
	int dft_filter_size = 50;
#endif

	dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
	anchor = normalizeAnchor(anchor, kernel.size());

	if ( kernel.cols * kernel.rows < dft_filter_size ) {
		// Direct spatial-domain filtering.
		Ptr<FilterEngine> f = createLinearFilter(src.type(), dst.type(), kernel,
							  anchor, delta, borderType );
		f->apply(src, dst);
		return;
	}

	// Large kernel: DFT-based path. crossCorr cannot run fully in place,
	// so detach the source when src and dst share data.
	Mat temp;
	if ( src.data == dst.data ) {
		src.copyTo(temp);
	} else {
		temp = src;
	}
	crossCorr( temp, kernel, dst, anchor, delta, borderType );
}


void sepFilter2D( const Mat& src, Mat& dst, int ddepth,
				  const Mat& kernelX, const Mat& kernelY, Point anchor,
				  double delta, int borderType ) {
	if ( ddepth < 0 ) {
		ddepth = src.depth();
	}

	dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );

	Ptr<FilterEngine> f = createSeparableLinearFilter(src.type(),
						  dst.type(), kernelX, kernelY, anchor, delta, borderType );
	f->apply(src, dst);
}

}


// Legacy C-API wrapper: wraps the CvArr arguments as cv::Mat headers and
// forwards to cv::filter2D with delta 0 and replicated borders. The
// destination depth is taken from dstarr, which must already be allocated
// with the source's size and channel count.
CV_IMPL void
cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor ) {
	cv::Mat src = cv::cvarrToMat(srcarr);
	cv::Mat dst = cv::cvarrToMat(dstarr);
	cv::Mat kernel = cv::cvarrToMat(_kernel);

	CV_Assert( src.size() == dst.size() && src.channels() == dst.channels() );

	cv::filter2D( src, dst, dst.depth(), kernel, anchor, 0, cv::BORDER_REPLICATE );
}

/* End of file. */
